In [1]:
# Load required packages.
import os
import torch
import numpy as np
import pandas as pd

from plyfile import PlyData, PlyElement
from helper_functions import load_object, save_object


In [2]:
# Folder that the later cells read the .ply surface files from.
# NOTE(review): machine-specific absolute path; the commented-out line is an
# alternative location for another user's machine.
#path = 'C:/Users/thoma/Desktop/ZHAW MLS/David prod/ADLProtProject/surface'
path = 'C:/Users/david/pyproj/pyg/adl/surfaces'

In [3]:
# List the files located directly inside a folder (no recursion).
from os import walk
def list_files(mpath):
    """Return the names of the files that sit directly in ``mpath``.

    Only the first triple produced by ``walk`` is consumed, so sub-folders
    are not descended into and directory names are not part of the result.
    When ``walk`` produces nothing, an empty list is returned.
    """
    _, _, top_level_files = next(walk(mpath), (None, None, []))
    return list(top_level_files)

In [4]:
# Alternative to list_files(): os.listdir returns every entry of `path`
# (sub-directories included, not only files).
# NOTE(review): this module-level `files_ply` is not referenced by the cells
# visible here; read_ply_folder builds its own listing.
files_ply = os.listdir(path)

In [5]:
def read_ply_file(folder_path,file_name,requirement):
    """Read one PLY file and return the part selected by ``requirement``.

    Parameters
    ----------
    folder_path : str
        Directory that contains the file.
    file_name : str
        Name of the PLY file inside ``folder_path``.
    requirement : str
        'feature' -> DataFrame built from the first PLY element,
        'vertex'  -> DataFrame built from the second PLY element,
        'plyfile' -> the ``PlyData`` object itself,
        'all'     -> ``[feature, vertex_indices]`` (both DataFrames).

    Returns
    -------
    pandas.DataFrame, PlyData or list
        See ``requirement``.

    Raises
    ------
    ValueError
        If ``requirement`` is not one of the four values above. Previously
        the function fell off the end and silently returned ``None``.
    """
    valid_requirements = ('feature', 'vertex', 'plyfile', 'all')
    if requirement not in valid_requirements:
        raise ValueError(
            f"requirement must be one of {valid_requirements}, got {requirement!r}"
        )

    plydata = PlyData.read(os.path.join(folder_path, file_name))

    if requirement == 'plyfile':
        return plydata

    # Build each table only when it is asked for, so a file without a second
    # element can still be read with 'feature' (and no work is wasted).
    if requirement == 'feature':
        return pd.DataFrame(plydata.elements[0].data)
    if requirement == 'vertex':
        return pd.DataFrame(plydata.elements[1].data)

    # requirement == 'all'
    feature = pd.DataFrame(plydata.elements[0].data)
    vertex_indices = pd.DataFrame(plydata.elements[1].data)
    return [feature, vertex_indices]

In [6]:
def read_ply_folder(folder_path):
    """Read every file found directly in ``folder_path`` as a PLY surface.

    Parameters
    ----------
    folder_path : str
        Directory to scan. The listing now uses this argument; the previous
        version listed the module-level ``path`` instead, so the file names
        and the folder they were read from could disagree.

    Returns
    -------
    tuple
        ``(features_prot, vertex_indices, files_ply)``: two lists with one
        DataFrame per file, followed by the list of file names. All three
        share the same order.
    """
    files_ply = list_files(folder_path)

    features_prot = []
    vertex_indices = []
    for ply_name in files_ply:
        # 'all' yields both tables from a single parse of the file; the
        # earlier code parsed every file twice.
        feature, vertex = read_ply_file(folder_path, ply_name, 'all')
        features_prot.append(feature)
        vertex_indices.append(vertex)

    return features_prot,vertex_indices,files_ply


def complex_separation(files_ply,df):
    """Group per-file items by the part of the file name before the first '_'.

    Parameters
    ----------
    files_ply : list of str
        File names such as ``'1A99_CD.ply'``.
    df : sequence
        One item per file, aligned with ``files_ply`` by position.

    Returns
    -------
    tuple
        ``(complexes, complexes_names)``: two parallel lists of lists, one
        inner list per prefix, in order of first appearance of the prefix.

    Notes
    -----
    Within a group, a file whose suffix (everything from the first '_' on) is
    strictly longer than every suffix seen so far in that group is appended;
    any other file is inserted at the front. For the listing
    ``1A99_C.ply, 1A99_CD.ply, 1A99_D.ply`` this yields
    ``['1A99_D.ply', '1A99_C.ply', '1A99_CD.ply']``.

    A name without any '_' now forms a group under its full name. The old
    ``find('_')`` returned -1 for such names, which cut off the last
    character and could merge unrelated files into one group.
    """
    # prefix -> (items, names); dicts preserve first-seen order.
    groups = {}
    # prefix -> longest suffix length seen so far in that group.
    longest_suffix = {}

    # Single pass over the listing instead of re-scanning it once per prefix.
    for i, file_name in enumerate(files_ply):
        prefix, sep, rest = file_name.partition('_')
        suffix_len = len(sep) + len(rest)

        if prefix not in groups:
            groups[prefix] = ([], [])
            longest_suffix[prefix] = 0
        items, names = groups[prefix]

        if suffix_len > longest_suffix[prefix]:
            items.append(df[i])
            names.append(file_name)
            longest_suffix[prefix] = suffix_len
        else:
            items.insert(0, df[i])
            names.insert(0, file_name)

    complexes = [items for items, _ in groups.values()]
    complexes_names = [names for _, names in groups.values()]
    return  complexes,complexes_names
        
            

# Read every surface file in `path`, then group the per-file feature tables
# by complex (the part of the file name before the first '_').
feature,vetex,ply_files = read_ply_folder(path)
complexes_feature,complexes_names = complex_separation(ply_files,feature)
print(complexes_names)
# NOTE(review): this prints the whole list of per-file feature tables, which
# is large for a big folder; a short summary may be enough.
print(feature)

[['1A99_D.ply', '1A99_C.ply', '1A99_CD.ply'], ['1ACB_I.ply', '1ACB_E.ply', '1ACB_EI.ply'], ['1AGQ_D.ply', '1AGQ_C.ply', '1AGQ_CD.ply'], ['1ARZ_C.ply', '1ARZ_A.ply', '1ARZ_AC.ply'], ['1AXI_B.ply', '1AXI_A.ply', '1AXI_AB.ply'], ['1B65_D.ply', '1B65_B.ply', '1B65_BD.ply'], ['1B8A_B.ply', '1B8A_A.ply', '1B8A_AB.ply'], ['1BCP_L.ply', '1BCP_H.ply', '1BCP_HL.ply'], ['1BGX_T.ply', '1BGX_HL.ply'], ['1BK6_E.ply', '1BK6_B.ply', '1BK6_BE.ply'], ['1BVK_F.ply', '1BVK_DE.ply', '1BVK_DEF.ply'], ['1BVN_T.ply', '1BVN_P.ply', '1BVN_PT.ply'], ['1C8N_C.ply', '1C8N_A.ply', '1C8N_AC.ply'], ['1C9S_H.ply', '1C9S_G.ply', '1C9S_GH.ply'], ['1CB7_D.ply', '1CB7_B.ply', '1CB7_BD.ply'], ['1CBW_D.ply', '1CBW_ABC.ply', '1CBW_ABCD.ply'], ['1CS0_B.ply', '1CS0_A.ply'], ['1D2Z_B.ply', '1D2Z_A.ply', '1D2Z_AB.ply'], ['1DBQ_B.ply', '1DBQ_A.ply', '1DBQ_AB.ply'], ['1DBW_B.ply', '1DBW_A.ply', '1DBW_AB.ply'], ['1DC1_B.ply', '1DC1_A.ply', '1DC1_AB.ply'], ['1DDZ_B.ply', '1DDZ_A.ply', '1DDZ_AB.ply'], ['1DE4_C.ply', '1DE4_A.ply'], ['

In [1]:
from c_ProteinGraph import ProteinGraph
from torch_geometric.utils import dense_to_sparse  # kept for other cells; no longer needed below

# This cell relies on names created by earlier cells (path, complexes_names,
# read_ply_file, save_object, os, np, torch). Run those first: executing it in
# a fresh kernel fails with "NameError: name 'path' is not defined".
source_dir = path
dest_dir = 'C:/Users/david/pyproj/pyg/adl/surfaces_graphs'


def _triangles_to_edge_index(triangles, n_nodes):
    """Turn triangle faces into an edge_index tensor of shape (2, n_edges).

    Produces the same edges that the former dense approach produced: every
    triangle side in both directions, plus one self-loop per node (the dense
    matrix started from the identity), without duplicates and sorted by
    source index, then target index. The n_nodes x n_nodes float matrix is
    never allocated, so memory grows with the number of edges instead of
    with the square of the number of vertices.

    Raises ValueError when a face references a vertex outside
    ``range(n_nodes)``.
    """
    tri = np.asarray(triangles, dtype=np.int64).reshape(-1, 3)
    if tri.size and (tri.min() < 0 or tri.max() >= n_nodes):
        raise ValueError('face references a vertex index outside the vertex table')

    a, b, c = tri[:, 0], tri[:, 1], tri[:, 2]
    loops = np.arange(n_nodes, dtype=np.int64)
    src = np.concatenate([a, b, a, c, b, c, loops])
    dst = np.concatenate([b, a, c, a, c, b, loops])

    # np.unique over rows removes duplicates and sorts them lexicographically.
    pairs = np.unique(np.stack([src, dst], axis=1), axis=0)
    return torch.from_numpy(np.ascontiguousarray(pairs.T))


os.makedirs(dest_dir, exist_ok=True)

for complex_files in complexes_names:  # `complex` would shadow the builtin

    if len(complex_files) != 3:  # Check if there are three files of the complex
        continue

    for file in complex_files[0:2]:  # only do it for the subunits, not for the complex
        name = file[0:-4]
        target = dest_dir + f'/{name}.pkl'

        # Only do it for proteins that are not yet in the destination dir.
        # A direct existence check avoids re-listing the folder for every file.
        if os.path.exists(target):
            continue

        features, edges = read_ply_file(source_dir, file, 'all')

        pos = features.loc[:, ["x", "y", "z"]].to_numpy()
        features = features.loc[:, ["charge", "hbond", "hphob", "iface", 'nx', 'ny', 'nz']]

        # The first column of `edges` holds the vertex indices of each face.
        triangles = [list(face) for face in edges.iloc[:, 0]]
        edge_index = _triangles_to_edge_index(triangles, pos.shape[0])

        protein = ProteinGraph(features=features, pos=pos, edge_index=edge_index, name=name)
        save_object(protein, target)

NameError: name 'path' is not defined

In [None]:
# `import os.walk` cannot work: walk is a function inside os, not a submodule.
from os import walk

# NOTE(review): this rebinds `path` to another user's machine-specific folder
# for the cells that follow.
path = 'C:/Users/thoma/OneDrive - ZHAW/ProteinSurfaces'

# Reminder of the helper's call signature. The bare line
# `load_object(filename):` was a SyntaxError, so it is kept as a comment:
#   load_object(filename)
 


## Test what is in a pickle file now

In [None]:
# Load one pickled object from the 'graphs' sub-folder of `path` and print it.
# NOTE(review): `dir` shadows the built-in dir() for the rest of the session.
dir = path + '/graphs'
test = load_object(dir + '/1A99_C.pkl')
print(test)

In [None]:
# Call num_features() on the loaded object (presumably the number of feature
# columns per node; confirm against the ProteinGraph class).
test.num_features()

In [None]:
# Call num_nodes() on the loaded object (presumably the number of surface
# vertices; confirm against the ProteinGraph class).
test.num_nodes()

In [None]:
# Show the `features` attribute; the graph-building cell passes a table with
# the columns charge, hbond, hphob, iface, nx, ny, nz as `features`.
test.features

In [None]:
# Show the `pos` attribute; the graph-building cell passes the x, y, z columns
# as `pos`.
test.pos

In [None]:
# Show the `edge_index` attribute; the graph-building cell derives it from the
# triangle faces of the surface.
test.edge_index

In [None]:
# Show the `name` attribute; the graph-building cell passes the file name
# without its last four characters (the '.ply' extension).
test.name

In [None]:
# Sketch of a `patches` attribute: a tuple holding one tuple per patch.
# NOTE(review): `label` and `center_index` are not defined in the cells visible
# here, so this raises NameError unless they are defined elsewhere. The three
# entries also differ in length (3, 2 and 0); confirm the intended layout.
test.patches = ((edge_index, label, center_index), (edge_index, label ), ())