In [2]:
import pandas as pd
import numpy as np 
import glob
from biopandas.pdb import PandasPdb
import warnings
warnings.filterwarnings("ignore")
from get_distances import *
import sys
from multiprocessing import Pool
import time
import math

In [30]:
psp_data = pd.read_csv("/people/imal967/git_repos/pheno_analysis/test_psp.csv").sample(n=3)
pockets_data = pd.read_csv("/people/imal967/git_repos/pheno_analysis/pockets_data.csv")
output_location = "/people/imal967/git_repos/pheno_analysis/merged_pockets.csv"

In [31]:
def run_parallel_pockets():
    #adding columns to psp data

    psp_data['closest_pocket'] = "NaN"
    psp_data['inside_pocket'] = 0
    psp_data['distance_from_pocket'] = "NaN"

    # get all of the unique uniprots
    unique_uniprots = psp_data['uniprot_id'].unique() 

    start_time = time.perf_counter()
    with Pool(10) as pool:
        output = pool.map(find_pockets_per_uniprot, unique_uniprots)
    finish_time = time.perf_counter()
    
    # save the csv and output the start and end times
    print("Program finished in {} seconds - using multiprocessing".format(finish_time-start_time))
    return(output)
    #print(type(output))
    #output.to_csv(output_location)

In [32]:
'''
this function does pockets calcuations for each uniprot tthat it is given
'''

# for each unique uniprotID...
# for uniprot in unique_uniprots:
def find_pockets_per_uniprot(uniprot):
    #print("start")
    # isolate to psp and pockets in each uniprot
    psp_only_uniprot = psp_data[psp_data.uniprot_id == uniprot]
    pocket_only_uniprot = pockets_data[pockets_data.uniprot_id == uniprot]


    # parse your structure here
    pdb_path = "/rcfs/projects/proteometer/alphafold_swissprot_pdb"
    pdb_name = glob.glob("/rcfs/projects/proteometer/alphafold_swissprot_pdb/*" + uniprot + "*")
    #print("name of pdb is:", pdb_name)
    if pdb_name:  
        ppdb = PandasPdb()  
        ppdb.read_pdb(pdb_name[0])


    # for each psp
        for phosphosite_row_index in psp_only_uniprot.index:
            #print(psp_only_uniprot)
            #print(phosphosite_row_index)
            residue_num = psp_only_uniprot.loc[phosphosite_row_index,'res_number'] # finding the residue number of the psp
            min_dist = 100000000000000000000000000000000 # make min dist extremely high at first
            #print(residue_num)
            # use the residue # to get the coordinates in space from pdb file
            
            
            for pocket_index in pocket_only_uniprot.index : # get all the residues in all of the pockets 
                pocket_residues = pocket_only_uniprot.loc[pocket_index,'pocket_resid']

                # check if it's inside of a pocket
                pocket_residues = pocket_residues[1:-1].split(",") # format the pocket_residues because it's a string
                #print(pocket_residues)
                if residue_num in pocket_residues:
                    psp_data.loc[phosphosite_row_index,'inside_pocket'] = 1 # if residue is in the pocket, put 1 in the inside pocket column
                    psp_data.loc[phosphosite_row_index,'closest_pocket'] = pocket_only_uniprot.loc[pocket_index,'full_id'] # put unique pocketID in closest pocket
                    psp_data.loc[phosphosite_row_index,'distance_from_pocket'] = 0 
                    break # break because you don't want to contiue looking for pockets (and therefore overwrite the inside pocket and closest pocket)

                if psp_data.loc[phosphosite_row_index,'inside_pocket'] == 0: # if the phosphosite isn't in any pockets
                    #print("phosphosite isn't in any pockets")
                    input_struct = ppdb.df['ATOM']
                    #print(input_struct)
                    new_dist = find_mean_distances(input_struct, residue_num, pocket_residues)
                    if residue_num:
                        if min_dist > new_dist: # if this is the smallest distance so far, replace min_dist with new_dist
                            psp_data.loc[phosphosite_row_index,'closest_pocket'] = pocket_only_uniprot.loc[pocket_index,'full_id'] # put unique pocketID in closest pocket
                            psp_data.loc[phosphosite_row_index,'distance_from_pocket'] = new_dist # replace distance_from_pocket with min_dist
                            min_dist = new_dist 
                            #print("added smallest distance:", min_dist)
                
    else: # if we can't find the pdb file
        for phosphosite_row_index in psp_only_uniprot.index:
            psp_data.loc[phosphosite_row_index,'inside_pocket'] = 'NaN' 
            psp_data.loc[phosphosite_row_index,'closest_pocket'] = 'NaN' 
            psp_data.loc[phosphosite_row_index,'distance_from_pocket'] = 'NaN'

    #print("end")

    return(psp_data)

In [33]:
list_to_export = run_parallel_pockets()

Program finished in 33.640095461159945 seconds - using multiprocessing


In [20]:
#pd.DataFrame(list_to_export[1]).to_csv(output_location)
list_to_export[1]

## Testing Interfaces Parallel

In [51]:
psp_data = pd.read_csv("/people/imal967/git_repos/pheno_analysis/test_psp.csv").sample(n = 10)
output_location = "/people/imal967/git_repos/pheno_analysis/merged_interfaces.csv"
psp_data

Unnamed: 0.1,Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,...,Ambiguous_Site,RES_NUM,PKA_ID,uniprotID,AA,res_number,pK,state,position,uniprot_id
23,390279,TIMM17A,TIMM17A,Q99595,1q32.1,K35-ub,15398750,human,18.02,TM; Tim17,...,0,35,Q99595_35,Q99595,LYS,35.0,10.229886,protonated,35.0,Q99595
36,383545,SMARCD3,SMARCD3,Q6STE5,7q36.1,K106-ub,41495511,human,55.02,,...,1,106,Q6STE5_106,Q6STE5,LYS,106.0,12.559181,protonated,106.0,Q6STE5
29,248238,TUBA1A,TUBA1A,Q71U36,12q13.12,S439-p,6835021,human,50.14,,...,0,439,Q71U36_439,Q71U36,SER,439.0,,undefined,439.0,Q71U36
35,425464,FBN1,FBN1,P35555,15q21.1,K599-sm,55430722,human,312.3,cEGF,...,0,599,P35555_599,P35555,LYS,599.0,10.663203,protonated,599.0,P35555
9,268177,ZNF550,ZNF550,Q7Z398,19q13.43,T254-p,1220282,human,48.38,,...,0,254,Q7Z398_254,Q7Z398,THR,254.0,,undefined,254.0,Q7Z398
6,92329,GOLGB1,GOLGB1,Q14789,3q13.33,T1260-p,5859716,human,376.02,,...,0,1260,Q14789_1260,Q14789,ASP,1260.0,3.383659,deprotonated,2460.0,Q14789
34,33564,CABLES2,CABLES2,Q9BTV7,20q13.33,S130-p,4270174,human,52.24,,...,0,130,Q9BTV7_130,Q9BTV7,SER,130.0,,undefined,130.0,Q9BTV7
48,375442,RIPK1,RIPK1,Q13546,6p25.2,K302-ub,964541188,human,75.93,,...,0,302,Q13546_302,Q13546,LYS,302.0,10.46496,protonated,302.0,Q13546
19,337372,H1-2,H1C,P16403,6p22.2,K52-ub,12253877,human,21.36,Linker_histone,...,0,52,P16403_52,P16403,LYS,52.0,10.561629,protonated,52.0,P16403
45,120300,LAMB1,LAMB1,P07942,7q31.1,Y220-p,20621765,human,198.04,Laminin_N,...,0,220,P07942_220,P07942,TYR,220.0,13.47212,protonated,220.0,P07942


In [59]:
def run_parallel_interfaces():
    #adding columns to psp data

    psp_data['closest_interface'] = ""
    psp_data['inside_interface'] = 0
    psp_data['distance_from_interface'] = np.nan

    # get all of the unique uniprots
    # unique_uniprots = psp_data['uniprot_id'].unique()
    unique_uniprot_psp = [psp_data[psp_data["uniprot_id"]==uniprot_id].copy() for uniprot_id in psp_data["uniprot_id"].unique()]

    start_time = time.perf_counter()
    with Pool(4) as pool:
        output = pool.map(find_interfaces_per_uniprot, unique_uniprot_psp)
    finish_time = time.perf_counter()
    
    # save the csv and output the start and end times
    print("Program finished in {} seconds - using multiprocessing".format(finish_time-start_time))
    return(output)


In [61]:
psp_data['closest_interface'] = "NaN"
psp_data['inside_interface'] = 0
psp_data['distance_from_interface'] = "NaN"

    # get all of the unique uniprots
    # unique_uniprots = psp_data['uniprot_id'].unique()
unique_uniprot_psp = [psp_data[psp_data["uniprot_id"]==uniprot_id].copy() for uniprot_id in psp_data["uniprot_id"].unique()]
start_time = time.perf_counter()
with Pool(4) as pool:
    output = pool.map(find_interfaces_per_uniprot, unique_uniprot_psp)
finish_time = time.perf_counter()

['', '12', '16', '77', '81', '82', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '121', '124', '125', '127', '128', '129', '130', '131', '132', '134', '135', '138', '13']
['06', '108', '112', '115', '116', '118', '119', '121', '122', '125', '126', '128', '129', '132', '133', '135', '136', '139', '140', '142', '143', '146', '149', '150', '151', '152', '153', '154', '270', '271', '273', '274', '275', '276', '278', '366', '368', '371', '372', '375', '381', '384', '385', '388', '389', '391', '392', '395', '396', '398', '399', '402', '403', '405', '406', '409', '410', '412', '413', '416', '41']
['1', '15', '71', '73', '74', '77', '98', '100', '101', '102', '180', '181', '182', '221', '224', '261', '262', '264', '265', '346', '389', '392', '393', '394', '396', '397', '398', '400', '403', '404', '407', '422', '423', '424', '426', '427', '429', '430', '431', '434', '435', '437', '438', '450', '45']
added smallest distance: 31.407578653509432
the interface is: Q71U36_Q99426
['1', 

ValueError: could not convert string to float: ''

In [60]:
'''
this function does interfaces calcuations for each uniprot tthat it is given
'''

# for each unique uniprotID...
# for uniprot in unique_uniprots:
def find_interfaces_per_uniprot(psp_only_uniprot):

    # isolate to psp and interfaces in each uniprot
    interfaces_data = pd.read_csv("/rcfs/projects/proteometer/ProtVar/predictions/interfaces/2024.05.28_interface_summary_5A.tsv", delimiter='\t', header=0)
    uniprot = psp_only_uniprot["uniprot_id"].to_list()[0]
    interface_only_uniprot = interfaces_data.loc[(interfaces_data['uniprot_id1'] == uniprot) | (interfaces_data['uniprot_id2'] == uniprot)] # isolate to uniprot in either 1 or 2


    # parse your structure here
    pdb_path = "/rcfs/projects/proteometer/alphafold_swissprot_pdb"
    pdb_name = glob.glob("/rcfs/projects/proteometer/alphafold_swissprot_pdb/*" + uniprot + "*")
    #print("name of pdb is:", pdb_name)
    if pdb_name:  
        ppdb = PandasPdb()  
        ppdb.read_pdb(pdb_name[0])


    # for each psp
        for phosphosite_row_index in psp_only_uniprot.index:
            residue_num = psp_only_uniprot.loc[phosphosite_row_index,'res_number'] # finding the residue number of the psp
            min_dist = 100000000000000000000000000000000 # make min dist extremely high at first
            #print(residue_num)
            # use the residue # to get the coordinates in space from pdb file
            
            
            for interface_index in interface_only_uniprot.index : # get all the residues in all of the interfaces 
                if pd.notna(interface_only_uniprot.loc[interface_index,'ifresid1']) & pd.notna(interface_only_uniprot.loc[interface_index,'ifresid1']):
                    if interfaces_data.loc[interface_index,'uniprot_id1'] == uniprot:
                        interface_residues = interface_only_uniprot.loc[interface_index,'ifresid1']
                    elif interfaces_data.loc[interface_index,'uniprot_id2'] == uniprot:
                        interface_residues = interface_only_uniprot.loc[interface_index,'ifresid2']
                    
                    # check if it's inside of a interface
                    interface_residues = interface_residues[1:-1].split(",") # format the interface_residues because it's a string
                    interface_residues = [e[1:] for e in interface_residues] # remove the first letter from each bc it includes residue type
                    print(interface_residues)
                    if residue_num in interface_residues:
                        psp_only_uniprot.loc[phosphosite_row_index,'inside_interface'] = 1 # if residue is in the interface, put 1 in the inside interface column
                        psp_only_uniprot.loc[phosphosite_row_index,'closest_interface'] = interface_only_uniprot.loc[interface_index,'interaction_id'] # put unique interfaceID in closest interface
                        psp_only_uniprot.loc[phosphosite_row_index,'distance_from_interface'] = 0 
                        break # break because you don't want to contiue looking for interfaces (and therefore overwrite the inside interface and closest interface)

                    if psp_only_uniprot.loc[phosphosite_row_index,'inside_interface'] == 0: # if the phosphosite isn't in any interfaces
                        #print("phosphosite isn't in any interfaces")
                        input_struct = ppdb.df['ATOM']
                        #print(input_struct)
                        new_dist = find_mean_distances(input_struct, residue_num, interface_residues)
                        if residue_num:
                            if min_dist > new_dist: # if this is the smallest distance so far, replace min_dist with new_dist
                                psp_only_uniprot.loc[phosphosite_row_index,'closest_interface'] = interface_only_uniprot.loc[interface_index,'interaction_id'] # put unique interfaceID in closest interface
                                psp_only_uniprot.loc[phosphosite_row_index,'distance_from_interface'] = new_dist # replace distance_from_interface with min_dist
                                min_dist = new_dist 
                                print("added smallest distance:", min_dist)
                                print("the interface is:", interface_only_uniprot.loc[interface_index,'interaction_id'])           
    else: # if we can't find the pdb file
        for phosphosite_row_index in psp_only_uniprot.index:
            psp_only_uniprot.loc[phosphosite_row_index,'inside_interface'] = 'NaN' 
            psp_only_uniprot.loc[phosphosite_row_index,'closest_interface'] = 'NaN' 
            psp_only_uniprot.loc[phosphosite_row_index,'distance_from_interface'] = 'NaN'
    #print(psp_data)
    return(psp_only_uniprot)



In [45]:
list_to_export = run_parallel_interfaces()
#list_to_export[1]

KeyError: 0