In [23]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors, Descriptors3D
from rdkit.ML.Descriptors import MoleculeDescriptors
from tqdm import tqdm

In [24]:
calc = MoleculeDescriptors.MolecularDescriptorCalculator([desc[0] for desc in Chem.Descriptors._descList])

def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None 
    return calc.CalcDescriptors(mol)

In [5]:
#2D RDKit train dataset descriptors
df_train = pd.read_csv("/home/users/akshay/PCPpred/data/Train.csv")
descriptor_data = []
for smiles in df_train['SMILES']:
    descriptors = calculate_descriptors(smiles)
    if descriptors is not None:
        descriptor_data.append(descriptors)
    else:
        descriptor_data.append([np.nan] * len(calc.GetDescriptorNames()))

descriptor_df = pd.DataFrame(descriptor_data, columns=calc.GetDescriptorNames())
train_2d_rdkit = pd.concat([df_train[['ID','SMILES','Permeability']], descriptor_df], axis=1)
train_2d_rdkit.to_csv('/home/users/akshay/PCPpred/Descriptors/Train_2d_RDKit_des.csv',index=False)
print("Shape: ",train_2d_rdkit.shape, '\n')
train_2d_rdkit

Shape:  (5568, 220) 



Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,15.738544,15.738544,0.010382,-1.908222,0.047997,24.622047,1773.325,...,0,0,0,0,0,0,0,0,0,0
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,15.975705,15.975705,0.027671,-1.943827,0.026511,21.983740,1745.057,...,0,0,0,0,0,0,0,0,0,0
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,15.828473,15.828473,0.049834,-1.862294,0.021075,22.464000,1733.267,...,0,0,0,0,0,0,0,0,0,0
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,15.595105,15.595105,0.004954,-1.811756,0.069603,24.479675,1725.281,...,0,0,0,0,0,0,0,0,0,0
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,15.867592,15.867592,0.029192,-1.876686,0.046796,23.089431,1723.309,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,12.835172,12.835172,0.168936,-0.728726,0.606745,24.965517,402.539,...,0,0,0,0,0,0,0,0,0,0
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,12.673271,12.673271,0.196704,-0.722720,0.611202,25.037037,374.485,...,0,0,0,0,0,0,0,0,0,0
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,12.572226,12.572226,0.181773,-1.016186,0.465702,27.576923,370.494,...,0,0,0,0,0,0,0,0,0,0
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,12.364448,12.364448,0.112529,-1.084532,0.446971,28.200000,356.467,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#2D RDKit test dataset descriptors
df_test = pd.read_csv("/home/users/akshay/PCPpred/data/Test.csv")
descriptor_data = []
for smiles in df_test['SMILES']:
    descriptors = calculate_descriptors(smiles)
    if descriptors is not None:
        descriptor_data.append(descriptors)
    else:
        descriptor_data.append([np.nan] * len(calc.GetDescriptorNames()))

descriptor_df = pd.DataFrame(descriptor_data, columns=calc.GetDescriptorNames())
test_2d_rdkit = pd.concat([df_test[['ID','SMILES','Permeability']], descriptor_df], axis=1)
test_2d_rdkit.to_csv('/home/users/akshay/PCPpred/Descriptors/Test_2d_RDKit_des.csv',index=False)
print("Shape: ",test_2d_rdkit.shape, '\n')
test_2d_rdkit

Shape:  (1392, 220) 



Unnamed: 0,ID,SMILES,Permeability,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,15.806942,15.806942,0.022547,-1.946094,0.037676,23.507937,1777.744,...,0,0,0,0,0,0,0,0,0,0
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,15.512854,15.512854,0.072929,-1.849230,0.082004,25.902439,1725.281,...,0,0,0,0,0,0,0,0,0,0
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,16.029525,16.029525,0.042154,-1.949385,0.046040,22.827869,1701.218,...,0,0,0,0,0,0,0,0,0,0
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,15.776936,15.776936,0.046352,-1.865645,0.035370,22.622951,1686.166,...,0,0,0,0,0,0,0,0,0,0
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,15.432735,15.432735,0.040020,-1.845687,0.100132,26.663866,1669.173,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,12.958074,12.958074,0.156212,-0.733388,0.586385,24.903226,430.593,...,0,0,0,0,0,0,0,0,0,0
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,12.958074,12.958074,0.156212,-0.733388,0.586385,24.903226,430.593,...,0,0,0,0,0,0,0,0,0,0
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,12.898424,12.898424,0.143657,-0.742398,0.662387,22.258065,430.549,...,0,0,0,0,0,0,0,0,0,0
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,13.095577,13.095577,0.065481,-0.737917,0.561202,26.806452,430.549,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from mordred import Calculator, descriptors
calc = Calculator(descriptors, ignore_3D=True)

def calculate_mordred_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # Return None if the SMILES is invalid
    return calc(mol)

In [41]:
print('start')
#2D Mordred train descriptors
df_train = pd.read_csv("/home/users/akshay/PCPpred/data/Train.csv")
descriptor_data = []
for smiles in df_train['SMILES']:
    descriptors = calculate_mordred_descriptors(smiles)
    if descriptors is not None:
        descriptor_data.append(descriptors)
    else:
        descriptor_data.append([np.nan] * len(calc.descriptors))

descriptor_df = pd.DataFrame(descriptor_data, columns=[desc.__class__.__name__ for desc in calc.descriptors])
train_mordred_2d = pd.concat([df_train[['ID','SMILES','Permeability']], descriptor_df], axis=1)
train_mordred_2d.to_csv('/home/users/akshay/PCPpred/Descriptors/Train_2d_Mordred_desc.csv', index=False)
print('Shape: ',train_mordred_2d.shape)
train_mordred_2d

start


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Shape:  (5568, 1616)


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount,WalkCount.1,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,155.712529,2.416852,4.832818,...,11.593980,169.458441,1772.125588,6.420745,107741,214,630.0,725.0,55.173611,28.291667
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,152.500384,2.445965,4.858291,...,11.590155,178.997365,1742.937365,6.835048,106182,208,620.0,719.0,49.861111,27.277778
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,3,154.777488,2.415089,4.830179,...,11.523579,167.253791,1732.084392,6.511595,102214,204,616.0,705.0,50.027778,28.083333
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,149.378369,2.419912,4.839823,...,11.562315,165.286866,1724.125588,6.338697,101212,208,608.0,699.0,55.673611,27.319444
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.261697,2.429936,4.859872,...,11.591302,165.339859,1722.146324,6.285206,107844,215,608.0,706.0,56.194444,27.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,35.896500,2.304816,4.609632,...,9.859065,63.474076,402.263091,6.385128,2286,42,138.0,153.0,10.638889,6.583333
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,33.357653,2.311467,4.622934,...,9.832367,61.242657,374.231791,6.565470,1880,40,130.0,145.0,10.138889,6.083333
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,31.502840,2.295993,4.591987,...,9.633842,59.559229,370.258006,6.170967,1648,37,118.0,129.0,10.777778,6.083333
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,30.506102,2.303541,4.607082,...,9.648660,58.482085,356.242356,6.249866,1472,37,114.0,126.0,10.527778,5.861111


In [42]:
#2D Mordred test descriptors
df_test = pd.read_csv("/home/users/akshay/PCPpred/data/Test.csv")
descriptor_data = []
for smiles in df_test['SMILES']:
    descriptors = calculate_mordred_descriptors(smiles)
    if descriptors is not None:
        descriptor_data.append(descriptors)
    else:
        descriptor_data.append([np.nan] * len(calc.descriptors))

descriptor_df = pd.DataFrame(descriptor_data, columns=[desc.__class__.__name__ for desc in calc.descriptors])
test_mordred_2d = pd.concat([df_test[['ID','SMILES','Permeability']], descriptor_df], axis=1)
test_mordred_2d.to_csv('/home/users/akshay/PCPpred/Descriptors/Test_2d_Mordred_desc.csv', index=False)
print('Shape: ',test_mordred_2d.shape)
test_mordred_2d

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Shape:  (1392, 1616)


Unnamed: 0,ID,SMILES,Permeability,ABCIndex,ABCGGIndex,AcidicGroupCount,BasicGroupCount,AdjacencyMatrix,AdjacencyMatrix.1,AdjacencyMatrix.2,...,WalkCount,WalkCount.1,Weight,Weight.1,WienerIndex,WienerIndex.1,ZagrebIndex,ZagrebIndex.1,ZagrebIndex.2,ZagrebIndex.3
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,155.498239,2.424408,4.847496,...,11.617204,168.482860,1776.076051,6.529691,111779,220,626.0,728.0,55.444444,28.277778
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.138684,2.427131,4.854261,...,11.589822,165.351205,1724.125588,6.338697,99910,213,610.0,706.0,56.284722,27.402778
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,150.676716,2.420176,4.840351,...,11.578086,164.308833,1700.068073,6.464137,103927,212,606.0,704.0,53.222222,27.333333
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,151.698924,2.415398,4.830623,...,11.527095,177.329996,1685.010893,6.633901,95572,199,610.0,701.0,47.777778,27.083333
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,145.614560,2.422204,4.844100,...,11.547955,161.156358,1668.062988,6.415627,93670,207,588.0,682.0,53.972222,26.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.436032,2.302525,4.605049,...,9.885069,65.694305,430.294391,6.236151,2750,44,146.0,161.0,11.138889,7.083333
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,38.436032,2.302525,4.605049,...,9.885069,65.694305,430.294391,6.236151,2750,44,146.0,161.0,11.138889,7.083333
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,37.855097,2.317338,4.629714,...,9.940542,65.841858,430.258006,6.619354,2644,45,148.0,164.0,11.750000,7.000000
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,39.745462,2.403441,4.738314,...,10.079414,79.819128,430.258006,6.619354,2650,47,154.0,176.0,10.250000,6.972222


In [48]:
#RDKit 3d descriptors
def generate_3d_descriptors(smiles):
    # Convert SMILES to a molecule object
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Invalid SMILES: {smiles}")
        return None
    
    # Add hydrogens
    mol = Chem.AddHs(mol)
    
    # Generate 3D coordinates for the molecule
    AllChem.EmbedMolecule(mol)
    
    try:
        descriptors = Descriptors3D.CalcMolDescriptors3D(mol)
        return descriptors
    except Exception as e:
        print(f"Error calculating descriptors for SMILES '{smiles}': {e}")
        return None

In [49]:
df_train = pd.read_csv("/home/users/akshay/PCPpred/data/Train.csv")
descriptor_data = []
for smiles in tqdm(df_train['SMILES'],desc='3d_descriptors', unit='smiles'):
    descriptors =generate_3d_descriptors(smiles) 
    if descriptors is not None:
        descriptor_data.append(descriptors)
    else:
        descriptor_data.append({'PMI1': np.nan,
  'PMI2': np.nan,
  'PMI3': np.nan,
  'NPR1': np.nan,
  'NPR2': np.nan,
  'RadiusOfGyration': np.nan,
  'InertialShapeFactor': np.nan,
  'Eccentricity': np.nan,
  'Asphericity': np.nan,
  'SpherocityIndex': np.nan,
  'PBF': np.nan})

3d_descriptors:   0%|          | 2/5568 [00:11<7:38:12,  4.94s/smiles] 

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)C(=O)[C@H](CC(C)C)NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H]1CC(=O)N[C@@H](Cc2ccccc2)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N2CCC[C@H]2C(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@@H]([C@@H](C)O)C(=O)N[C@@H](CC(C)C)C(=O)N1)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   0%|          | 5/5568 [00:42<13:12:40,  8.55s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H]1CC(=O)N(C)[C@@H](C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](CC(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H]([C@@H](C)CC)C(=O)N(C)[C@@H](CC(C)C)C(=O)N1)C(C)C)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   0%|          | 12/5568 [01:57<16:32:30, 10.72s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](CC(C)C)NC(=O)[C@H](C)N(C)C(=O)[C@@H]1CC(=O)N[C@H](C(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc2ccccc2)C(=O)N[C@@H]([C@@H](C)O)C(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N(C)[C@@H](CC(C)C)C(=O)N1)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   0%|          | 19/5568 [03:01<11:19:55,  7.35s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H]1CCCN1C(=O)[C@H](Cc1ccccc1)N(C)C(=O)[C@@H]1CC(=O)N(C)CCC(=O)N[C@@H](Cc2ccccc2)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H]([C@@H](C)O)C(=O)N[C@@H](C(C)C)C(=O)NCC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N1)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   1%|          | 55/5568 [08:24<16:21:16, 10.68s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](Cc1cccc(Cl)c1)N(C)C(=O)[C@@H]1CC(=O)N(C)[C@@H](C)C(=O)N[C@@H](C(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N[C@@H]([C@@H](C)O)C(=O)N2CCC[C@H]2C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N[C@@H](CC(C)C)C(=O)N1)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   1%|▏         | 75/5568 [10:51<9:48:31,  6.43s/smiles] 

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](Cc1cccc(Cl)c1)N(C)C(=O)[C@@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2ccccc2)C(=O)N[C@@H]([C@@H](C)O)C(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N1)[C@@H](C)CC)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   2%|▏         | 92/5568 [12:56<11:20:51,  7.46s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CC(C)C)NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2ccccc2)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N1)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   2%|▏         | 127/5568 [16:56<7:07:01,  4.71s/smiles] 

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](C)N(C)C(=O)[C@@H](NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H]1CC(=O)N[C@H](C(C)C)C(=O)N(C)[C@@H](C)C(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](CC(C)C)C(=O)N1)[C@@H](C)CC)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   2%|▏         | 128/5568 [16:58<5:45:11,  3.81s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@@H]1NC(=O)[C@H](C(C)C)NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C@H](C(C)C)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)C[C@@H](C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@H](C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@H](C(=O)N2CCCCC2)[C@@H](C)CC)[C@@H](C)O)NC1=O': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   3%|▎         | 147/5568 [18:29<5:12:54,  3.46s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)CN(C)C(=O)[C@H](Cc1ccccc1)N(C)C(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H](NC(=O)[C@@H]1CC(=O)N(C)[C@@H](C)C(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N(C)[C@@H](C)C(=O)N(C)[C@@H](Cc2cccc(Cl)c2)C(=O)N[C@@H]([C@@H](C)O)C(=O)N1)C(C)C)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors: 100%|██████████| 5568/5568 [2:03:57<00:00,  1.34s/smiles]  


In [50]:
descriptor_df = pd.DataFrame(descriptor_data)
descriptor_df

Unnamed: 0,PMI1,PMI2,PMI3,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,Eccentricity,Asphericity,SpherocityIndex,PBF
0,57271.392656,84706.704866,131312.284498,0.436146,0.645078,8.778150,0.000011,0.899876,0.225120,0.140103,1.650689
1,,,,,,,,,,,
2,58101.718060,69316.298095,115939.041133,0.501140,0.597868,8.378654,0.000010,0.865366,0.190623,0.182536,1.783029
3,49788.948788,91676.720195,127366.763818,0.390910,0.719785,8.826650,0.000014,0.920429,0.250355,0.193597,1.907716
4,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
5563,1891.284732,5055.635080,6387.678814,0.296083,0.791467,4.069781,0.000418,0.955162,0.359988,0.161207,0.888675
5564,1171.539745,4912.089011,5395.951856,0.217115,0.910329,3.914991,0.000777,0.976146,0.486740,0.208979,0.970223
5565,2179.710197,3127.136956,4764.692067,0.457471,0.656315,3.686739,0.000301,0.889224,0.202322,0.186896,0.863298
5566,1873.085082,2859.925158,4153.290896,0.450988,0.688593,3.530497,0.000368,0.892530,0.198717,0.271256,1.003594


In [51]:
num_columns = descriptor_df.shape[1]
descriptor_df.columns = [f'3d_rdkit_{i+1}' for i in range(num_columns)]
train_3d_rdkit = pd.concat([df_train[['ID','SMILES','Permeability']], descriptor_df], axis=1)
print('Shape before:' , train_3d_rdkit.shape)
train_3d_rdkit.to_csv('/home/users/akshay/PCPpred/Descriptors/Train_3d_RDKit_desc.csv',index=False)
print('Shape after: ',train_3d_rdkit.shape)
train_3d_rdkit

Shape before: (5568, 14)
Shape after:  (5568, 14)


Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,3d_rdkit_8,3d_rdkit_9,3d_rdkit_10,3d_rdkit_11
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,57271.392656,84706.704866,131312.284498,0.436146,0.645078,8.778150,0.000011,0.899876,0.225120,0.140103,1.650689
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,,,,,,,,,,,
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,58101.718060,69316.298095,115939.041133,0.501140,0.597868,8.378654,0.000010,0.865366,0.190623,0.182536,1.783029
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,49788.948788,91676.720195,127366.763818,0.390910,0.719785,8.826650,0.000014,0.920429,0.250355,0.193597,1.907716
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,1891.284732,5055.635080,6387.678814,0.296083,0.791467,4.069781,0.000418,0.955162,0.359988,0.161207,0.888675
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,1171.539745,4912.089011,5395.951856,0.217115,0.910329,3.914991,0.000777,0.976146,0.486740,0.208979,0.970223
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,2179.710197,3127.136956,4764.692067,0.457471,0.656315,3.686739,0.000301,0.889224,0.202322,0.186896,0.863298
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,1873.085082,2859.925158,4153.290896,0.450988,0.688593,3.530497,0.000368,0.892530,0.198717,0.271256,1.003594


In [52]:
df_test = pd.read_csv("/home/users/akshay/PCPpred/data/Test.csv")
descriptor_data = []
for smiles in tqdm(df_test['SMILES'],desc='3d_descriptors', unit='smiles'):
    descriptors = generate_3d_descriptors(smiles)
    if descriptors is not None:
        descriptor_data.append(descriptors)
    else:
        descriptor_data.append({'PMI1': np.nan,
  'PMI2': np.nan,
  'PMI3': np.nan,
  'NPR1': np.nan,
  'NPR2': np.nan,
  'RadiusOfGyration': np.nan,
  'InertialShapeFactor': np.nan,
  'Eccentricity': np.nan,
  'Asphericity': np.nan,
  'SpherocityIndex': np.nan,
  'PBF': np.nan})

descriptor_data

3d_descriptors:   0%|          | 3/1392 [00:12<1:27:50,  3.79s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](C)N(C)C(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](C(C)C)NC(=O)[C@@H](C(C)C)NC(=O)C[C@@H](C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N[C@H](C(=O)N(C)[C@@H](C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@H](C(=O)N2CCCCC2)[C@@H](C)CC)C(C)C)NC1=O': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   1%|          | 8/1392 [00:41<1:41:32,  4.40s/smiles]

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CC(=O)N(C)CCC(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H]([C@@H](C)O)C(=O)N(C)[C@@H](C)C(=O)N[C@@H]([C@@H](C)CC)C(=O)N(C)[C@@H](CC(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N1)[C@@H](C)CC)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors:   4%|▍         | 56/1392 [03:51<59:40,  2.68s/smiles]  

Error calculating descriptors for SMILES 'CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](Cc1ccccc1)N(C)C(=O)[C@@H]1CC(=O)N(C)[C@@H](C)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N(C)[C@@H](Cc2ccccc2)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H]([C@@H](C)O)C(=O)N1)C(=O)N1CCCCC1': Computing 3D Descriptors requires a structure with at least 1 conformer


3d_descriptors: 100%|██████████| 1392/1392 [28:15<00:00,  1.22s/smiles] 


[{'PMI1': 34902.4958791359,
  'PMI2': 130685.7259404624,
  'PMI3': 150085.8571419773,
  'NPR1': 0.232550198558143,
  'NPR2': 0.8707397780780712,
  'RadiusOfGyration': 9.422580412145805,
  'InertialShapeFactor': 2.4947779697284747e-05,
  'Eccentricity': 0.9725843948730456,
  'Asphericity': 0.45796227878369633,
  'SpherocityIndex': 0.16018208133065717,
  'PBF': 1.798006492582216},
 {'PMI1': 59935.57914215625,
  'PMI2': 71658.90266720287,
  'PMI3': 111362.46001719117,
  'NPR1': 0.5382027222899343,
  'NPR2': 0.6434744945122511,
  'RadiusOfGyration': 8.391116264013775,
  'InertialShapeFactor': 1.0736102056944291e-05,
  'Eccentricity': 0.8428154185346302,
  'Asphericity': 0.14767660352465456,
  'SpherocityIndex': 0.27961583241545634,
  'PBF': 2.2128428224152072},
 {'PMI1': nan,
  'PMI2': nan,
  'PMI3': nan,
  'NPR1': nan,
  'NPR2': nan,
  'RadiusOfGyration': nan,
  'InertialShapeFactor': nan,
  'Eccentricity': nan,
  'Asphericity': nan,
  'SpherocityIndex': nan,
  'PBF': nan},
 {'PMI1': 4371

In [53]:
descriptor_df = pd.DataFrame(descriptor_data)

In [54]:
num_columns = descriptor_df.shape[1]
print('num_columns',num_columns)
descriptor_df.columns = [f'3d_rdkit_{i+1}' for i in range(num_columns)]
print('descriptor_df.columns',descriptor_df.columns)
test_3d_rdkit = pd.concat([df_test[['ID','SMILES','Permeability']], descriptor_df], axis=1)
print('Shape before:' , test_3d_rdkit.shape)
test_3d_rdkit.to_csv('/home/users/akshay/PCPpred/Descriptors/Test_3d_RDKit_desc.csv',index=False)
print('Shape after: ',test_3d_rdkit.shape)
test_3d_rdkit

num_columns 11
descriptor_df.columns Index(['3d_rdkit_1', '3d_rdkit_2', '3d_rdkit_3', '3d_rdkit_4', '3d_rdkit_5',
       '3d_rdkit_6', '3d_rdkit_7', '3d_rdkit_8', '3d_rdkit_9', '3d_rdkit_10',
       '3d_rdkit_11'],
      dtype='object')
Shape before: (1392, 14)
Shape after:  (1392, 14)


Unnamed: 0,ID,SMILES,Permeability,3d_rdkit_1,3d_rdkit_2,3d_rdkit_3,3d_rdkit_4,3d_rdkit_5,3d_rdkit_6,3d_rdkit_7,3d_rdkit_8,3d_rdkit_9,3d_rdkit_10,3d_rdkit_11
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,34902.495879,130685.725940,150085.857142,0.232550,0.870740,9.422580,0.000025,0.972584,0.457962,0.160182,1.798006
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,59935.579142,71658.902667,111362.460017,0.538203,0.643474,8.391116,0.000011,0.842815,0.147677,0.279616,2.212843
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,,,,,,,,,,,
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,43711.954275,73710.637354,99360.782765,0.439932,0.741848,8.017663,0.000017,0.898031,0.198091,0.275576,2.084180
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,57017.133165,78022.452280,122140.702987,0.466815,0.638792,8.777143,0.000011,0.884355,0.200440,0.168561,1.779081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,2624.539814,5235.905285,7178.427044,0.365615,0.729395,4.178873,0.000278,0.930766,0.277056,0.165034,0.881064
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,3393.803325,4465.343773,6048.910610,0.561060,0.738206,4.018692,0.000218,0.827775,0.110689,0.436331,1.318221
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,2350.480932,4746.916538,6169.098467,0.381009,0.769467,3.925110,0.000327,0.924571,0.253947,0.239286,0.995183
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,2185.854125,6171.808705,7781.242611,0.280913,0.793165,4.329231,0.000363,0.959733,0.382290,0.137020,0.812082
