In [1]:
from Bio.PDB import *
from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley
from Bio.PDB.DSSP import DSSP
import json
import numpy as np
from torch_geometric.data import Data
import torch

In [None]:
# Per-residue lookup tables stored as JSON. pdb_to_graph indexes both by
# one-letter amino-acid code and concatenates the values into the node features.
# Context managers ensure the file handles are closed right after parsing.
with open("aa_phy7.txt") as aaphy7_file:
    AAPHY7 = json.load(aaphy7_file)
with open("BLOSUM62_dim23.txt") as blosum62_file:
    blosum62 = json.load(blosum62_file)

In [None]:
# Three-letter residue name -> one-letter amino-acid code for the 20 standard
# amino acids. Any other residue name is absent and raises KeyError on lookup.
one_letter = {
    'VAL': 'V',
    'ILE': 'I',
    'LEU': 'L',
    'GLU': 'E',
    'GLN': 'Q',
    'ASP': 'D',
    'ASN': 'N',
    'HIS': 'H',
    'TRP': 'W',
    'PHE': 'F',
    'TYR': 'Y',
    'ARG': 'R',
    'LYS': 'K',
    'SER': 'S',
    'THR': 'T',
    'MET': 'M',
    'ALA': 'A',
    'GLY': 'G',
    'PRO': 'P',
    'CYS': 'C',
}

In [None]:
def one_hot_encoding_unk(x, allowable_set):
    """One-hot encode ``x`` against ``allowable_set`` with an "unknown" fallback.

    Args:
        x: Value to encode.
        allowable_set: Sequence of permitted values; its last entry doubles as
            the bucket for any value that is not in the sequence.

    Returns:
        A list with one boolean per entry of ``allowable_set``, True where the
        entry equals ``x`` (or equals the last entry when ``x`` is unknown).

    Raises:
        IndexError: If ``allowable_set`` is empty.
    """
    # Unknown values collapse onto the final entry of the allowable set.
    target = x if x in allowable_set else allowable_set[-1]
    return [target == candidate for candidate in allowable_set]

In [None]:
def pdb_to_graph(name, path,
                 dssp_exec='../../../miniconda3/envs/rdkit/bin/mkdssp',
                 cutoff=6, chain_id="A"):
    """Build a residue-level graph for one chain of a PDB structure.

    The file ``f"{path}{name}.pdb"`` is parsed once and the same file is handed
    to DSSP. Each residue of the selected chain becomes a node; two residues
    are connected when their C-alpha atoms are at most ``cutoff`` apart.

    Args:
        name: Structure identifier; also the PDB file stem.
        path: Prefix prepended verbatim to ``name`` (include the trailing
            path separator when it is a directory).
        dssp_exec: DSSP executable passed to ``DSSP(..., dssp=...)``.
        cutoff: Maximum C-alpha distance (inclusive) for an edge, in the
            coordinate units of the PDB file.
        chain_id: Identifier of the chain to convert.

    Returns:
        A ``torch_geometric.data.Data`` with
        ``x``: float tensor, one row per residue, made of DSSP fields 3..5,
        a 7-way one-hot of the DSSP secondary-structure code, the ``AAPHY7``
        entry and the ``blosum62`` entry for the residue;
        ``edge_idx``: long tensor of shape (2, num_edges) holding the index
        pairs whose distance is <= ``cutoff`` (self pairs included, since a
        residue is at distance 0 from itself).

    Raises:
        KeyError: If a residue name is not in ``one_letter``, a residue has no
            ``CA`` atom, or a residue is missing from the DSSP output.
    """
    pdb_file = f"{path}{name}.pdb"
    # Parse once and reuse the same model for both the chain and DSSP so that
    # residue ids are guaranteed to refer to the same structure.
    model = PDBParser().get_structure(name, pdb_file)[0]
    chain = model[chain_id]
    dssp = DSSP(model, pdb_file, dssp=dssp_exec)

    # Secondary-structure codes to one-hot encode; codes outside this list are
    # mapped onto the last entry by one_hot_encoding_unk.
    ss_states = ['-', 'B', 'E', 'G', 'H', 'S', 'T']

    features = []
    pos = []
    for res in chain:
        resname = one_letter[res.get_resname()]
        # The DSSP map is keyed by (chain id, residue id), not by position;
        # keying by residue also keeps features aligned with the residue when
        # the model contains other chains.
        dssp_record = dssp[(chain_id, res.id)]
        feature = np.array(
            # Fields 3..5 are relative ASA, phi and psi in Biopython's DSSP
            # tuple layout; field 2 is the secondary-structure code.
            list(dssp_record[3:6])
            + one_hot_encoding_unk(dssp_record[2], ss_states)
            + AAPHY7[resname]
            + blosum62[resname]
        )
        features.append(feature)
        pos.append(np.array(list(res['CA'].get_vector())))
    features = torch.as_tensor(np.array(features), dtype=torch.float)

    # Pairwise C-alpha distances via broadcasting; reshape keeps the (N, 3)
    # layout valid even when the chain is empty.
    coords = np.array(pos, dtype=float).reshape(-1, 3)
    diff = coords[:, None, :] - coords[None, :, :]
    dis = np.sqrt((diff * diff).sum(-1))

    # Build the index tensor directly as int64 instead of round-tripping the
    # indices through a float32 tensor.
    edge_idx = torch.as_tensor(np.array(np.where(dis <= cutoff)), dtype=torch.long)

    # NOTE(review): the attribute is kept as ``edge_idx`` for compatibility with
    # existing callers. PyG layers and batching conventionally look for
    # ``edge_index`` - confirm whether downstream code expects that name.
    g = Data(x=features, edge_idx=edge_idx)
    return g
    