# Compute descriptors for Kai's structures

In [None]:
#! /hits/fast/mbm/treydewk/.venv/venv_up/bin/python
import numpy as np
from pathlib import Path
from dscribe.descriptors import LMBTR, SOAP
from ase.io import read
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
import re
import pandas as pd

# Dscribe descriptors
# Chemical elements the descriptors are set up for.
species = ["H", "O", "C", "N", "S"]  # maybe add 'Y'

# Smooth overlap of atomic positions (SOAP), Kai's parameters
soap_settings = dict(rcut=6.0, nmax=8, lmax=6, sigma=1.0)
soap = SOAP(species=species, periodic=False, **soap_settings)

# Local many-body tensor representation (LMBTR), Kai's parameters
# k2 term: "distance" geometry function on a 50-point grid from 1 to 5.8,
# with exponential weighting.
k2_settings = {
    "geometry": {"function": "distance"},
    "grid": {"min": 1, "max": 5.8, "n": 50, "sigma": 0.1},
    "weighting": {"function": "exp", "scale": 0.8, "cutoff": 1e-2, "threshold": 1e-3},
}
# k3 term: "angle" geometry function on a 100-point grid from 0 to 180,
# with exponential weighting.
k3_settings = {
    "geometry": {"function": "angle"},
    "grid": {"min": 0, "max": 180, "n": 100, "sigma": 2},
    "weighting": {"function": "exp", "scale": 0.3, "cutoff": 1e-2, "threshold": 1e-3},
}
lmbtr = LMBTR(
    species=species,
    k2=k2_settings,
    k3=k3_settings,
    periodic=False,
    normalization="l2_each",
    flatten=True,
)

# Reference table; its (hash_u1, hash_u2) pairs decide which structures get
# descriptors.
# NOTE: pickle files can execute code on load -- only read trusted files.
data = pd.read_pickle('/hits/fast/mbm/treydewk/documentation/docs/data_complete_w_descriptors')

root = Path('/hits/basement/mbm/riedmiki/structures/KR0008/')

# Folders that hold the start/end structures: one `se` folder per trajectory
# batch plus the explicitly listed production folders.
se_folders = list((root/'traj').glob('batch*/se'))

se_folders = se_folders + [
    root/'start_end_prod_1', root/'start_end_prod_2', root/'start_end_prod_3',
    root/'start_end_prod_4', root/'start_end_prod_6', root/'start_end_prod_7',
    root/'start_end_prod_8', root/'start_end_prod_9', root/'start_end_prod_10',
    root/'start_end_prod_11', root/'start_end_prod_intra_2',
]

# Keep only files named *_1.pdb / *_2.pdb. A suffix test is used instead of
# re.search('_1.pdb', ...): that pattern was unanchored and its '.' matched
# any character.
pdb_files_all = [
    f
    for folder in se_folders
    for f in folder.glob('**/*.pdb')
    if f.name.endswith(('_1.pdb', '_2.pdb'))
]

# File names expected for every hash pair in the reference table. A set gives
# O(1) membership tests in the filter below (the list version rescanned all
# names for every file).
hashes_se = set()
for hash1, hash2 in zip(data['hash_u1'], data['hash_u2']):
    hashes_se.add(str(hash1) + '_' + str(hash2) + '_1.pdb')
    hashes_se.add(str(hash1) + '_' + str(hash2) + '_2.pdb')

pdb_files = [f for f in pdb_files_all if f.name in hashes_se]

# Free the large intermediates; only `pdb_files` is needed from here on.
del hashes_se, data, pdb_files_all, root, se_folders

RDLogger.DisableLog('rdApp.*')  # silence RDKit log output while parsing PDB files

# Per-file accumulators; entry i of every list belongs to pdb_files[i].
hash_u1 = []
hash_u2 = []
morgan = []
soap_rad = []
soap_H = []
lmbtr_rad = []
lmbtr_H = []


for file in pdb_files:
    print('Computing descriptors for {}'.format(file.name))
    # File names look like <hash_u1>_<hash_u2>_<1|2>.pdb
    split_hash = file.name.split('_')
    hash_u1.append(split_hash[0])
    hash_u2.append(split_hash[1])

    path = str(file.resolve())
    mol_ase = read(path)
    mol_rdkit = Chem.MolFromPDBFile(path)
    if mol_rdkit is None:
        # MolFromPDBFile returns None on parse failure. Fail with the file
        # name instead of an opaque error inside the fingerprint call;
        # skipping the file is not an option because the result lists must
        # stay aligned with pdb_files.
        raise ValueError('RDKit could not parse {}'.format(file))

    # Atom 0 is treated as the H atom (the *_H descriptors are centred on it).
    # The radical center is taken to be the atom closest to it.
    # get_distances() is the one-atom-vs-many call; get_distance() is
    # documented for a single pair of atoms only.
    other_atoms = np.arange(1, len(mol_ase))
    rad_idx = mol_ase.get_distances(0, other_atoms).argmin() + 1  # find radical center

    # Morgan fingerprint (radius 2, 1024 bits) of the whole system
    morgan_fp = np.asarray(AllChem.GetMorganFingerprintAsBitVect(mol_rdkit, radius=2, nBits=1024))
    morgan.append(morgan_fp)

    ### DScribe descriptors ###
    # SOAP centred on the radical center and on the H atom
    soap_on_rad = soap.create(mol_ase, [rad_idx,], verbose=False)
    soap_rad.append(soap_on_rad)
    soap_on_H = soap.create(mol_ase, [0,], verbose=False)
    soap_H.append(soap_on_H)

    # LMBTR centred on the radical center and on the H atom
    lmbtr_on_rad = lmbtr.create(mol_ase, [rad_idx,], verbose=False)
    lmbtr_rad.append(lmbtr_on_rad)
    lmbtr_on_H = lmbtr.create(mol_ase, [0,], verbose=False)
    lmbtr_H.append(lmbtr_on_H)

# Collect the per-file lists into one table (one row per PDB file) and store
# it, so later cells can reload the descriptors without recomputing them.
column_names = [
    'pdb_file', 'hash_u1', 'hash_u2', 'morgan', 'soap_rad', 'soap_H', 'lmbtr_rad', 'lmbtr_H'
]
rows = list(
    zip(pdb_files, hash_u1, hash_u2, morgan, soap_rad, soap_H, lmbtr_rad, lmbtr_H)
)
results = pd.DataFrame(rows, columns=column_names)

results.to_pickle('/hits/fast/mbm/treydewk/documentation/docs/Kais_descriptors')

# Append descriptors to dataframe

In [2]:
# Reload the descriptor table that the first cell pickled to this path, so the
# cells below can run without recomputing the descriptors.
# NOTE: pickle files can execute code on load -- only read trusted files.
results = pd.read_pickle('/hits/fast/mbm/treydewk/documentation/docs/Kais_descriptors')

In [3]:
# Preview the first rows; each descriptor cell holds a whole array.
results.head()

Unnamed: 0,pdb_file,hash_u1,hash_u2,morgan,soap_rad,soap_H,lmbtr_rad,lmbtr_H
0,/hits/basement/mbm/riedmiki/structures/KR0008/...,8726767316812,8726818984343,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.004342754672521798, 0.02600148517465986, 0...","[[0.010936292036474388, 0.05792312017327755, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,/hits/basement/mbm/riedmiki/structures/KR0008/...,8726776569116,8726776668654,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00017437889221453973, 0.001306433281555183...","[[0.013685243226765387, 0.06801057489707658, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,/hits/basement/mbm/riedmiki/structures/KR0008/...,8760393178456,8760393261818,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00025711812753723476, 0.000637069180481320...","[[0.014950490025134197, 0.07668944826783508, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,/hits/basement/mbm/riedmiki/structures/KR0008/...,8726775242813,8726806149759,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0001086005620913891, 0.0012524841966032629...","[[0.010191546731131366, 0.05507533053555323, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,/hits/basement/mbm/riedmiki/structures/KR0008/...,8760368240406,8760348996371,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.001229423842431432, 0.005226441764633402, ...","[[0.010963342562624163, 0.05733634179875371, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [4]:
# Size check: the recorded run shows (6150, 8), i.e. one row per PDB file.
results.shape

(6150, 8)

In [35]:
# Load the reference dataset that the descriptors get appended to.
# NOTE(review): this is a relative path, while the first cell reads
# '/hits/fast/mbm/treydewk/documentation/docs/data_complete_w_descriptors' --
# presumably the same file; confirm the notebook's working directory.
# NOTE: pickle files can execute code on load -- only read trusted files.
data = pd.read_pickle('data_complete_w_descriptors')
data.head()

Unnamed: 0,translation,rotation,rad_name,h_name,central_atom_idx_1,central_atom_idx_2,rad_charge,h_charge,hash_u1,hash_u2,...,SRW10_H,TSRW10_H,MW_H,AMW_H,WPath_H,WPol_H,Zagreb1_H,Zagreb2_H,mZagreb1_H,mZagreb2_H
3,0.80351,270.0,Hyl-amid,Thr-amid,23.0,7.0,0.0,0.0,8754994733906,8754977793765,...,8.814776,41.369074,174.100442,6.696171,204.0,14.0,50.0,53.0,6.944444,2.888889
5,1.994497,60.0,Hyl-amid,Thr-amid,17.0,7.0,0.0,0.0,8754996169244,8754981509455,...,8.814776,41.369074,174.100442,6.696171,204.0,14.0,50.0,53.0,6.944444,2.888889
7,2.01019,240.0,Hyl-amid,Thr-amid,12.0,7.0,0.0,0.0,8755001596395,8754978356529,...,8.814776,41.369074,174.100442,6.696171,204.0,14.0,50.0,53.0,6.944444,2.888889
8,1.34234,120.0,Hyl-amid,Thr-amid,15.0,7.0,0.0,0.0,8754996669706,8754981789453,...,8.814776,41.369074,174.100442,6.696171,204.0,14.0,50.0,53.0,6.944444,2.888889
14,2.014465,150.0,Hyl-amid,Val-amid,17.0,9.0,0.0,0.0,8754996178758,8754978855988,...,8.814776,41.369074,172.121178,6.147185,204.0,14.0,50.0,53.0,6.944444,2.888889


In [6]:
# Size check: the recorded run shows (6150, 2854), the same row count as `results`.
data.shape

(6150, 2854)

In [7]:
# add SMILES
# One SMILES string per row of `results`, in row order, obtained by parsing
# each PDB file with RDKit (RDKit log output is silenced first).
RDLogger.DisableLog('rdApp.*')
smiles_all = [
    Chem.MolToSmiles(Chem.MolFromPDBFile(str(pdb.resolve())))
    for pdb in results['pdb_file']
]

In [8]:
# Single-column frame with the SMILES strings in the order of smiles_all; it
# gets a default 0..n-1 integer index.
smiles_df = pd.DataFrame(smiles_all, columns = ['SMILES_sys'])

In [9]:
# Index-based join: correct only if `results` also carries a 0..n-1 integer
# index (presumably true for the reloaded table -- confirm). Re-running this
# cell raises because the SMILES_sys column would then already exist.
results = results.join(smiles_df)

In [20]:
# mordred descriptors among the 20 best descriptors as determined by univariate feature selection
best = [
    'nBase_rad', 'SpMax_A_rad', 'ATSC2s_rad', 'ATSC1Z_rad',
    'ATSC2i_rad', 'NdNH_rad', 'SMR_VSA4_rad', 'max_spin_H', 'nBase_H', 'SpMax_A_H', 'ATSC2s_H',
    'ATSC1Z_H', 'ATSC2i_H', 'GATS2dv_H', 'BCUTdv-1h_H', 'SMR_VSA4_H', 'VSA_EState7_H'
]
# discard mordred descriptors other than those selected by univariate feature selection
# The two positional column ranges are hard-coded; columns outside them
# (0-72 and 1460-1466) are always kept.
# NOTE(review): the slice bounds depend on the exact column layout of `data`.
keep = set(best)
candidate_columns = list(data.columns[73:1460]) + list(data.columns[1467:])
to_drop = [col for col in candidate_columns if col not in keep]
data.drop(columns=to_drop, inplace=True)
data.shape

(6150, 96)

In [21]:
# For every row of `data`: the two expected PDB file stems
# ('<hash_u1>_<hash_u2>_1' and '..._2'), plus the row's 'reaction' value,
# which a later cell uses to decide which stem is the start structure.
pair_prefixes = (
    str(h1) + '_' + str(h2) for h1, h2 in zip(data['hash_u1'], data['hash_u2'])
)
hashes_se = [[prefix + '_1', prefix + '_2'] for prefix in pair_prefixes]
directions = data['reaction'].tolist()

In [22]:
# Add each PDB file's stem (file name without '.pdb') as a lookup key column.
stems_df = pd.DataFrame(
    [pdb_path.stem for pdb_path in results['pdb_file']],
    columns = ['stems'],
)
results = results.join(stems_df)

In [23]:
# Split the descriptor rows into "start" and "end" tables whose row order
# follows `data`: for direction 1 the *_1 structure is the start and *_2 the
# end; for direction 2 it is the other way round.
#
# Row positions are collected first and selected with a single .iloc call.
# The previous version called DataFrame.append once per row (quadratic, and
# removed in pandas 2.0) and scanned the whole frame for every lookup.

# First row position of every stem (the original also took the first match).
stem_to_position = {}
for position, stem in enumerate(results['stems']):
    stem_to_position.setdefault(stem, position)

start_positions = []
end_positions = []
for (stem_1, stem_2), direction in zip(hashes_se, directions):
    if direction == 1:
        start_stem, end_stem = stem_1, stem_2
    elif direction == 2:
        start_stem, end_stem = stem_2, stem_1
    else:
        # Skipping the row would leave the tables shorter than `data` and
        # break the row-by-row alignment that the later cells rely on.
        raise ValueError('Unexpected reaction direction: {!r}'.format(direction))
    # A stem without a matching PDB file raises KeyError naming that stem.
    start_positions.append(stem_to_position[start_stem])
    end_positions.append(stem_to_position[end_stem])

# .copy() yields independent frames, so the in-place edits in later cells do
# not trigger chained-assignment warnings.
descriptors_sorted_s = results.iloc[start_positions].copy()
descriptors_sorted_e = results.iloc[end_positions].copy()

In [24]:
# Preview: rows still carry their original `results` index labels here.
descriptors_sorted_s.head()

Unnamed: 0,pdb_file,hash_u1,hash_u2,morgan,soap_rad,soap_H,lmbtr_rad,lmbtr_H,SMILES_sys,stems
5668,/hits/basement/mbm/riedmiki/structures/KR0008/...,8754994733906,8754977793765,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00046770291116853604, 0.00333426418339619,...","[[0.009127191599907348, 0.047554903522187456, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,8754994733906_8754977793765_1
5670,/hits/basement/mbm/riedmiki/structures/KR0008/...,8754996169244,8754981509455,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0006780453020492238, 0.004709392772089226,...","[[0.01223500278873003, 0.06478293084052474, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,8754996169244_8754981509455_1
5667,/hits/basement/mbm/riedmiki/structures/KR0008/...,8755001596395,8754978356529,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0005291969804432233, 0.003657848454214293,...","[[0.012130910326353418, 0.06257216987162503, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,8755001596395_8754978356529_1
5666,/hits/basement/mbm/riedmiki/structures/KR0008/...,8754996669706,8754981789453,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0007258224761713728, 0.004744811430432084,...","[[0.010233072869219931, 0.05320444935454681, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,8754996669706_8754981789453_1
5673,/hits/basement/mbm/riedmiki/structures/KR0008/...,8754996178758,8754978855988,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0006048558320628348, 0.004466705202394738,...","[[0.005639131332280367, 0.028935711138589254, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(C)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,8754996178758_8754978855988_1


In [25]:
# The hash and stem columns were only needed for matching. Remove them and
# renumber the rows 0..n-1 so the start and end tables can be joined row by row.
helper_columns = ['hash_u1', 'hash_u2', 'stems']
descriptors_sorted_s = (
    descriptors_sorted_s.drop(columns = helper_columns).reset_index(drop=True)
)
descriptors_sorted_e = (
    descriptors_sorted_e.drop(columns = helper_columns).reset_index(drop=True)
)

In [26]:
# Preview after dropping the helper columns and renumbering the rows.
descriptors_sorted_s.head()

Unnamed: 0,pdb_file,morgan,soap_rad,soap_H,lmbtr_rad,lmbtr_H,SMILES_sys
0,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00046770291116853604, 0.00333426418339619,...","[[0.009127191599907348, 0.047554903522187456, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
1,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0006780453020492238, 0.004709392772089226,...","[[0.01223500278873003, 0.06478293084052474, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
2,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0005291969804432233, 0.003657848454214293,...","[[0.012130910326353418, 0.06257216987162503, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
3,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0007258224761713728, 0.004744811430432084,...","[[0.010233072869219931, 0.05320444935454681, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
4,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0006048558320628348, 0.004466705202394738,...","[[0.005639131332280367, 0.028935711138589254, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(C)C(N)C(N)=O.NCC(O)CCC(N)C(N)O


In [27]:
# Tag every column with '_start' / '_end' so the two tables can sit side by
# side, then join them on their shared 0..n-1 index.
descriptors_sorted_s = descriptors_sorted_s.rename(columns = '{}_start'.format)
descriptors_sorted_e = descriptors_sorted_e.rename(columns = '{}_end'.format)
descriptors_sorted = descriptors_sorted_s.join(descriptors_sorted_e)
descriptors_sorted.head()

Unnamed: 0,pdb_file_start,morgan_start,soap_rad_start,soap_H_start,lmbtr_rad_start,lmbtr_H_start,SMILES_sys_start,pdb_file_end,morgan_end,soap_rad_end,soap_H_end,lmbtr_rad_end,lmbtr_H_end,SMILES_sys_end
0,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00046770291116853604, 0.00333426418339619,...","[[0.009127191599907348, 0.047554903522187456, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00046822964064590624, 0.003321528911214239...","[[0.008900222204147426, 0.04679743876133917, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
1,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0006780453020492238, 0.004709392772089226,...","[[0.01223500278873003, 0.06478293084052474, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0004517923584980096, 0.003298088086730262,...","[[0.010480681134706005, 0.05444424966845287, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
2,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0005291969804432233, 0.003657848454214293,...","[[0.012130910326353418, 0.06257216987162503, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0005116912834079438, 0.003398872204448246,...","[[0.01060160088240023, 0.05319576884389037, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
3,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0007258224761713728, 0.004744811430432084,...","[[0.010233072869219931, 0.05320444935454681, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00046652448267147707, 0.003328962806179, 0...","[[0.007284088205742367, 0.03837221176115919, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
4,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0006048558320628348, 0.004466705202394738,...","[[0.005639131332280367, 0.028935711138589254, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(C)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0003923056769002688, 0.0031013514247140513...","[[0.010260228230235542, 0.052228494427901534, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(C)C(N)C(N)=O.NCC(O)CCC(N)C(N)O


In [28]:
# Give descriptors_sorted the index labels of `data` so the index-based join
# in a later cell lines up. The relabelling is purely positional: row i gets
# the i-th label of `data`. That is valid because the start/end tables were
# built by iterating over data's hash pairs in order.
indices = data.index
descriptors_sorted = descriptors_sorted.set_index(indices)
descriptors_sorted.head()

Unnamed: 0,pdb_file_start,morgan_start,soap_rad_start,soap_H_start,lmbtr_rad_start,lmbtr_H_start,SMILES_sys_start,pdb_file_end,morgan_end,soap_rad_end,soap_H_end,lmbtr_rad_end,lmbtr_H_end,SMILES_sys_end
3,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00046770291116853604, 0.00333426418339619,...","[[0.009127191599907348, 0.047554903522187456, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00046822964064590624, 0.003321528911214239...","[[0.008900222204147426, 0.04679743876133917, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
5,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0006780453020492238, 0.004709392772089226,...","[[0.01223500278873003, 0.06478293084052474, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0004517923584980096, 0.003298088086730262,...","[[0.010480681134706005, 0.05444424966845287, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
7,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0005291969804432233, 0.003657848454214293,...","[[0.012130910326353418, 0.06257216987162503, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0005116912834079438, 0.003398872204448246,...","[[0.01060160088240023, 0.05319576884389037, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
8,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0007258224761713728, 0.004744811430432084,...","[[0.010233072869219931, 0.05320444935454681, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.00046652448267147707, 0.003328962806179, 0...","[[0.007284088205742367, 0.03837221176115919, 0...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(=O)C(N)C(N)=O.NCC(O)CCC(N)C(N)O
14,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0006048558320628348, 0.004466705202394738,...","[[0.005639131332280367, 0.028935711138589254, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(C)C(N)C(N)=O.NCC(O)CCC(N)C(N)O,/hits/basement/mbm/riedmiki/structures/KR0008/...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.0003923056769002688, 0.0031013514247140513...","[[0.010260228230235542, 0.052228494427901534, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",CC(C)C(N)C(N)=O.NCC(O)CCC(N)C(N)O


In [29]:
# Size check: the recorded run shows (6150, 14), the same row count as `data`.
descriptors_sorted.shape

(6150, 14)

In [30]:
# Append the start/end descriptor columns to the reference data; join()
# matches rows on the index, which was aligned with `data` beforehand.
# Recorded run: 96 + 14 = 110 columns.
final_results = data.join(descriptors_sorted)
final_results.shape

(6150, 110)

In [31]:
# Checks
# Verify that every row's start and end PDB files belong to that row's hash
# pair. Each check prints True or False; previously a failed check printed
# nothing, which looked the same as the check not having run.
hash1 = final_results['hash_u1'].to_list()
hash2 = final_results['hash_u2'].to_list()
# f.stem[:-2] strips the trailing '_1' / '_2' from the file stem.
stems_start = [f.stem[:-2] for f in final_results['pdb_file_start']]
stems_end = [f.stem[:-2] for f in final_results['pdb_file_end']]
expected_stems = [str(h1) + '_' + str(h2) for h1, h2 in zip(hash1, hash2)]

checks = [
    expected_stems == stems_start,  # Check 1: start files match the hash pairs
    expected_stems == stems_end,    # Check 2: end files match the hash pairs
    stems_start == stems_end,       # Check 3: start and end agree with each other
]
for number, passed in enumerate(checks, start=1):
    print('Check {}: {}'.format(number, passed))


Check 1: True
Check 2: True
Check 3: True


In [32]:
# Persist the combined table; the path is relative, so the file is written to
# the notebook's current working directory.
final_results.to_pickle('data_complete_final')