In [1]:
import pandas as pd
import numpy as np 
import glob
from biopandas.pdb import PandasPdb
import warnings
warnings.filterwarnings("ignore")
from get_distances import *
import sys
from multiprocessing import Pool
import time
import math

In [7]:
psp_data = pd.read_csv("/people/imal967/git_repos/pheno_analysis/test_psp.csv")
pockets_data = pd.read_csv("/people/imal967/git_repos/pheno_analysis/pockets_data.csv")
output_location = "/people/imal967/git_repos/pheno_analysis/merged_pockets.csv"

In [9]:
def run_parallel_pockets():
    #adding columns to psp data

    psp_data['closest_pocket'] = "NaN"
    psp_data['inside_pocket'] = 0
    psp_data['distance_from_pocket'] = "NaN"

    # get all of the unique uniprots
    unique_uniprots = psp_data['uniprot_id'].unique() 

    start_time = time.perf_counter()
    with Pool(2) as pool:
        output = pool.map(find_pockets_per_uniprot, unique_uniprots)
    finish_time = time.perf_counter()
    
    # save the csv and output the start and end times
    print("Program finished in {} seconds - using multiprocessing".format(finish_time-start_time))
    print(type(output))
    output.to_csv(output_location)

In [10]:
'''
this function does pockets calcuations for each uniprot tthat it is given
'''

# for each unique uniprotID...
# for uniprot in unique_uniprots:
def find_pockets_per_uniprot(uniprot):

    # isolate to psp and pockets in each uniprot
    psp_only_uniprot = psp_data[psp_data.uniprot_id == uniprot]
    pocket_only_uniprot = pockets_data[pockets_data.uniprot_id == uniprot]


    # parse your structure here
    pdb_path = "/rcfs/projects/proteometer/alphafold_swissprot_pdb"
    pdb_name = glob.glob("/rcfs/projects/proteometer/alphafold_swissprot_pdb/*" + uniprot + "*")
    print("name of pdb is:", pdb_name)
    if pdb_name:  
        ppdb = PandasPdb()  
        ppdb.read_pdb(pdb_name[0])


    # for each psp
        for phosphosite_row_index in psp_only_uniprot.index:
            #print(psp_only_uniprot)
            #print(phosphosite_row_index)
            residue_num = psp_only_uniprot.loc[phosphosite_row_index,'res_number'] # finding the residue number of the psp
            min_dist = 100000000000000000000000000000000 # make min dist extremely high at first
            #print(residue_num)
            # use the residue # to get the coordinates in space from pdb file
            
            
            for pocket_index in pocket_only_uniprot.index : # get all the residues in all of the pockets 
                pocket_residues = pocket_only_uniprot.loc[pocket_index,'pocket_resid']

                # check if it's inside of a pocket
                pocket_residues = pocket_residues[1:-1].split(",") # format the pocket_residues because it's a string
                print(pocket_residues)
                if residue_num in pocket_residues:
                    psp_data.loc[phosphosite_row_index,'inside_pocket'] = 1 # if residue is in the pocket, put 1 in the inside pocket column
                    psp_data.loc[phosphosite_row_index,'closest_pocket'] = pocket_only_uniprot.loc[pocket_index,'full_id'] # put unique pocketID in closest pocket
                    psp_data.loc[phosphosite_row_index,'distance_from_pocket'] = 0 
                    break # break because you don't want to contiue looking for pockets (and therefore overwrite the inside pocket and closest pocket)

                if psp_data.loc[phosphosite_row_index,'inside_pocket'] == 0: # if the phosphosite isn't in any pockets
                    #print("phosphosite isn't in any pockets")
                    input_struct = ppdb.df['ATOM']
                    #print(input_struct)
                    new_dist = find_mean_distances(input_struct, residue_num, pocket_residues)
                    if residue_num:
                        if min_dist > new_dist: # if this is the smallest distance so far, replace min_dist with new_dist
                            psp_data.loc[phosphosite_row_index,'closest_pocket'] = pocket_only_uniprot.loc[pocket_index,'full_id'] # put unique pocketID in closest pocket
                            psp_data.loc[phosphosite_row_index,'distance_from_pocket'] = new_dist # replace distance_from_pocket with min_dist
                            min_dist = new_dist 
                            print("added smallest distance:", min_dist)
                
    else: # if we can't find the pdb file
        for phosphosite_row_index in psp_only_uniprot.index:
            psp_data.loc[phosphosite_row_index,'inside_pocket'] = 'NaN' 
            psp_data.loc[phosphosite_row_index,'closest_pocket'] = 'NaN' 
            psp_data.loc[phosphosite_row_index,'distance_from_pocket'] = 'NaN'

    return(psp_data)

In [11]:
run_parallel_pockets()

name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-P22626-F1-model_v4.pdb']
name of pdb is: ['/rcfs/projects/proteometer/alphafold_swissprot_pdb/AF-P15559-F1-model_v4.pdb']
['51', '68', '69', '70', '72', '73', '114', '117', '118', '119', '120', '121', '122', '123', '127', '129', '171', '175', '176', '179', '180']
['109', '110', '111', '112', '113', '115', '142', '143', '144', '145', '146', '147', '148', '153', '155', '157', '159', '187', '188', '189', '190', '192', '193', '194', '197']
added smallest distance: 14.906281503598533
['133', '134', '135', '138', '139', '140', '141', '178', '181', '182', '183', '184', '219', '220', '221', '222', '223', '224', '225', '226', '272', '273', '274']
added smallest distance: 32.85941890089949
['17', '18', '19', '20', '22', '24', '49', '51', '53', '64', '66', '96', '97', '98', '99', '100', '101', '102', '104', '108', '170', '171', '172', '173']
added smallest distance: 10.277050695641515
['132', '133', '134', '170', '174', '178'

AttributeError: 'list' object has no attribute 'to_csv'