In [1]:
# Three-letter residue name (columns 18-20 of a PDB ATOM record) -> one-letter
# amino-acid code.  Besides the standard residues this also maps SEC/PYL and
# the variant names HIE/HID/CYX (presumably AMBER-style histidine tautomers
# and disulfide-bonded cysteine -- confirm against the data source).
AA_dict = {
    'ALA': 'A', 'ARG': 'R',
    'ASN': 'N', 'ASP': 'D',
    'CYS': 'C', 'GLU': 'E',
    'GLN': 'Q', 'GLY': 'G',
    'HIE': 'H', 'HID': 'H',
    'HIS': 'H', 'ILE': 'I',
    'LEU': 'L', 'LYS': 'K',
    'MET': 'M', 'PHE': 'F',
    'PRO': 'P', 'SER': 'S',
    'THR': 'T', 'TRP': 'W',
    'TYR': 'Y', 'VAL': 'V',
    'SEC': 'U', 'PYL': 'O',
    'CYX': 'C',
}


def pdb2seq(file):
    """Extract the amino-acid sequence from the ATOM records of a PDB file.

    Only lines starting with ``ATOM`` contribute residues; HETATM and all
    other records are ignored.  Consecutive ATOM lines belonging to the same
    residue (same chain ID, residue number and insertion code) are collapsed
    into a single residue.  Gaps in the residue numbering are not marked in
    the output.

    Parameters
    ----------
    file : str
        Path to the PDB file; must end in ``.pdb`` (case-insensitive).

    Returns
    -------
    tuple
        ``(aa_sequence, aa_seq_list, aa_num_list)`` where ``aa_sequence`` is
        the one-letter sequence as a single string, ``aa_seq_list`` is the
        same sequence as a list of one-letter codes, and ``aa_num_list``
        holds, per residue, the residue number followed by the insertion
        code (if any) as a string, e.g. ``'27'`` or ``'27A'``.

    Raises
    ------
    AssertionError
        If ``file`` does not end in ``.pdb``.
    KeyError
        If an ATOM record carries a residue name missing from ``AA_dict``.
    ValueError
        If the residue-number columns of an ATOM record are not an integer.
    """
    assert file.lower().endswith('.pdb'), "Accepts PDB files only!"
    aa_seq_list = []
    aa_num_list = []
    # Identity of the residue the previous ATOM line belonged to.
    residue_key_prev = None
    with open(file, 'r') as f:
        for line in f:
            if line.startswith('TER'):
                # A chain ended: whatever follows is a new residue even if
                # the chain ID column is blank and the numbering repeats.
                residue_key_prev = None
                continue
            if not line.startswith('ATOM'):
                continue
            residue = line[17:20]
            chain_id = line[21:22]
            # Residue sequence number + code for insertions of residues
            seq_number = int(line[22:26])
            code_insert_residue = line[26:27]
            # The chain ID is part of the key: without it the first residue
            # of a chain was dropped whenever it shared number and insertion
            # code with the last residue of the preceding chain.
            residue_key = (chain_id, seq_number, code_insert_residue)
            if residue_key == residue_key_prev:
                continue
            residue_key_prev = residue_key
            try:
                one_letter = AA_dict[residue]
            except KeyError:
                # Same exception type as a bare lookup, but say where it
                # happened so a failure in a batch run can be located.
                raise KeyError(
                    "Unknown residue name %r (residue %s%s) in %s"
                    % (residue, seq_number, code_insert_residue.strip(), file)
                ) from None
            aa_seq_list.append(one_letter)
            aa_num_list.append(str(seq_number) + code_insert_residue.strip())
    return ''.join(aa_seq_list), aa_seq_list, aa_num_list

In [26]:
import numpy as np
import pandas as pd
import matplotlib as plt
from rdkit import Chem

In [35]:
import os

# Root of the data set.  Every complex is expected to live in a directory
# named after its id that contains '<id>_protein.pdb' and '<id>_ligand.sdf'.
dir_name_1 = './data/refined-set'
# os.listdir() returns entries in arbitrary order.  The order is kept as-is
# (not sorted) so that rows line up with any other CSV produced from the same
# listing; a later cell drops rows from two tables by shared row label.
dirs1 = os.listdir(dir_name_1)
with open('./protein_sequence.csv', 'w') as f:
    f.write('id,sequence,ligand\n')
    for i in dirs1:
        filename = dir_name_1 + '/%s/%s_protein.pdb' % (i, i)
        # Skip entries that are not complexes (no protein PDB inside).  This
        # replaces slicing off "the last two" entries, which only worked if
        # the listing happened to put the non-complex entries at the end.
        if not os.path.isfile(filename):
            continue
        aa_sequence, aa_seq_list, aa_num_list = pdb2seq(filename)
        file_mol = Chem.SDMolSupplier(dir_name_1 + '/%s/%s_ligand.sdf' % (i, i))
        # Falsy records (molecules the supplier could not parse) are skipped.
        # If the SDF holds several molecules they are joined with '.', the
        # SMILES component separator, instead of being run together.
        smiles = '.'.join(Chem.MolToSmiles(x) for x in file_mol if x)
        # Write the row in one go so a failure above never leaves half a row.
        f.write('%s,%s,%s\n' % (i, aa_sequence, smiles))

In [36]:
# Root of the data set.  Every complex is expected to live in a directory
# named after its id that contains '<id>_pocket.pdb' and '<id>_ligand.sdf'.
dir_name_1 = './data/refined-set'
# os.listdir() returns entries in arbitrary order.  The order is kept as-is
# (not sorted) so that rows line up with any other CSV produced from the same
# listing; a later cell drops rows from two tables by shared row label.
dirs1 = os.listdir(dir_name_1)
with open('./pocket_sequence.csv', 'w') as f:
    f.write('id,sequence,ligand\n')
    for i in dirs1:
        filename = dir_name_1 + '/%s/%s_pocket.pdb' % (i, i)
        # Skip entries that are not complexes (no pocket PDB inside).  This
        # replaces slicing off "the last two" entries, which only worked if
        # the listing happened to put the non-complex entries at the end.
        if not os.path.isfile(filename):
            continue
        aa_sequence, aa_seq_list, aa_num_list = pdb2seq(filename)
        file_mol = Chem.SDMolSupplier(dir_name_1 + '/%s/%s_ligand.sdf' % (i, i))
        # Falsy records (molecules the supplier could not parse) are skipped.
        # If the SDF holds several molecules they are joined with '.', the
        # SMILES component separator, instead of being run together.
        smiles = '.'.join(Chem.MolToSmiles(x) for x in file_mol if x)
        # Write the row in one go so a failure above never leaves half a row.
        f.write('%s,%s,%s\n' % (i, aa_sequence, smiles))

In [61]:
# Load both sequence tables from the working directory:
#   data1 <- pocket sequences, data2 <- full-protein sequences.
data1, data2 = (
    pd.read_csv(csv_path)
    for csv_path in ('./pocket_sequence.csv', './protein_sequence.csv')
)

In [66]:
# Remove every row whose ligand is missing in data1 from BOTH tables, so the
# two frames keep describing the same set of complexes.
# The row labels are collected once and dropped in a single call per frame:
# this avoids copying the frame on every dropped row and makes the cell safe
# to re-run (a second run finds nothing to drop, where a positional loop over
# range(len) would look up labels that no longer exist and raise KeyError).
missing_ligand = data1.index[data1['ligand'].isna()]
data1 = data1.drop(missing_ligand)
data2 = data2.drop(missing_ligand)

Unnamed: 0,id,sequence,ligand
0,10gs,YFPVRGRCLVTVEWQLKCYGQLPKYQSNTILERCIIYNPINGNGMD...,[NH3+]C(CCC(=O)NC(CSCc1ccccc1)C(=O)NC(C(=O)O)c...
1,11gs,YFPVRGRCALVTVEWQGLKCYGQLPKYQSNTILERCYISIYIPING...,CCC(CSCC(NC(=O)CCC([NH3+])C(=O)O)C(=O)NCC(=O)O...
2,13gs,YFPVRGRCVVTVETWQQLPSERCYISIYTNLPINGNGK,O=C(O)c1cc(/N=N/c2ccc(S(=O)(=O)Nc3ccccn3)cc2)c...
3,16pk,DNRHRRFGTAHRVGGAKVDKLGGAMAYKSKCPHVCFIMALDIWNGP...,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)C(F)(F)C...
4,184l,AVGILNKLKPVYDSLRRAAINMVFMGEGVAGFTNSLRMLQWANLWVIF,CC(C)Cc1ccccc1
...,...,...,...
19438,966c,FDGGNLAHAFAHFDEWYNLHRVAAHELGHSHSIGALMYPSYTFSV,O=C(CC1(S(=O)(=O)c2ccc(Oc3ccccc3)cc2)CCOCC1)NO
19439,9abp,KQEPWFECTPPVDDQLVMMAAIALDTARRNMNDSTINGVDLGSPHV,OCC1OC(O)C(O)C(O)C1O
19440,9hvp,RPLLLDTGADDTVLKMIGGIGGFIVLTPVNIIGRRLLDTGADDTVK...,CC(C)C(NC(=O)OCc1ccccc1)C(=O)NC(Cc1ccccc1)C(O)...
19441,9icd,GIGDVPLTIGIAATHGTAPKYKVNPGSITYDFRLCNIAQQLR,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1OP(=O)(O)O


In [68]:
# Persist the filtered tables.  to_csv() is called with its defaults, so the
# DataFrame index is written as the first (unnamed) column of each file.
for frame, out_path in ((data1, 'pock_data.csv'), (data2, 'pro_data.csv')):
    frame.to_csv(out_path)

In [69]:
# Length of the longest entry in data1['sequence'] (0 if the column is empty).
l = max((len(seq) for seq in data1['sequence']), default=0)
l

129