In [5]:
!pip install rdkit



In [6]:
from rdkit import Chem
from rdkit.Chem import Descriptors, MolFromSmiles
import pandas as pd

In [7]:
df = pd.read_csv('/content/my_data.csv')

## with help of Julia's code for RDKit

In [8]:
# Assuming you have a DataFrame named 'df' with a column named 'SMILES'

smiles_list = df['smiles'].tolist()
smiles_list[1]
descriptors_list = []

for smiles in smiles_list:
    mol = MolFromSmiles(smiles)
    descriptors_values = {}
    for descriptor in Descriptors.descList:
        value = descriptor[1](mol)
        descriptors_values[descriptor[0]] = value
    descriptors_list.append(descriptors_values)

# Now descriptors_list contains a list of dictionaries,
# each dictionary holds the descriptor values for one compound


In [9]:
descriptors_df = pd.DataFrame(descriptors_list)
descriptors_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,10.175139,10.175139,0.300833,-0.845486,0.481081,21.777778,129.115,122.059,129.042593,50,...,0,0,0,0,0,0,0,0,0,0
1,4.854254,4.854254,0.831740,0.831740,0.413811,8.222222,126.199,112.087,126.104465,52,...,0,0,0,0,0,0,0,0,2,0
2,10.577222,10.577222,0.244213,-0.248843,0.514333,46.777778,127.143,118.071,127.063329,50,...,0,0,0,0,0,0,0,0,0,0
3,5.365556,5.365556,0.416065,0.416065,0.530212,9.555556,127.151,118.079,127.085795,50,...,0,0,0,0,0,0,0,0,0,0
4,9.033542,9.033542,0.236690,-0.251481,0.435497,54.222222,124.143,116.079,124.063663,48,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,10.113194,10.113194,0.069444,-0.419352,0.286301,6.875000,111.100,106.060,111.032028,42,...,0,0,0,1,0,0,0,0,0,0
19996,10.231389,10.231389,0.199074,-0.737870,0.478072,9.444444,127.103,122.063,127.038176,48,...,0,0,0,0,0,0,0,0,0,0
19997,7.069931,7.069931,0.052662,0.052662,0.603800,22.666667,127.187,114.083,127.099714,52,...,0,0,0,0,0,0,0,0,0,0
19998,8.713519,8.713519,0.271204,0.271204,0.541783,39.888889,124.183,112.087,124.088815,50,...,0,0,0,1,0,0,0,0,0,0


## with help of Susan's code for RDkit

In [10]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

# Assuming you have a DataFrame named 'df' with a column named 'smiles'
smiles_list = df['smiles'].tolist()

# Get a list of available descriptor names
descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
num_descriptors = len(descriptor_names)

# Initialize an empty array to store descriptor values
descriptors_set = np.empty((0, num_descriptors), float)

# Compute descriptors for each molecule
for smiles in smiles_list:
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:  # Check if the molecule is valid
        properties = rdMolDescriptors.Properties()
        descriptors = properties.ComputeProperties(molecule)
        descriptors_set = np.append(descriptors_set, [descriptors], axis=0)

# Now descriptors_set contains the descriptor values for all molecules in the DataFrame


In [11]:

# Create a DataFrame to store descriptor values
df_final1 = pd.DataFrame(descriptors_set, columns=descriptor_names)

# Add the 'smiles' column to df_final
df_final1['smiles'] = smiles_list

# Now df_final contains the descriptor values along with the corresponding SMILES strings

# Save df_final to a CSV file
# df_final.to_csv('descriptor_values.csv', index=False)
df_final1

Unnamed: 0,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,NumHeteroatoms,...,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi,smiles
0,129.042593,129.115,4.0,1.0,3.0,1.0,4.0,9.0,16.0,4.0,...,2.614120,1.346061,1.346061,0.661663,-0.90,6.223457,2.144233,0.928882,1.482727,C1C(CN1)(C=O)OC=O
1,126.104465,126.199,1.0,0.0,3.0,0.0,1.0,9.0,23.0,1.0,...,3.361137,0.952015,0.952015,0.518283,-0.48,8.520000,7.520000,7.520000,7.118933,CCC#CCCCOC
2,127.063329,127.143,3.0,1.0,0.0,1.0,2.0,9.0,18.0,3.0,...,3.243139,2.025049,2.025049,1.315724,-0.73,5.086451,1.403530,0.676972,0.793220,CC1NC(=O)OC2CC12
3,127.085795,127.151,5.0,3.0,1.0,2.0,5.0,9.0,18.0,5.0,...,2.580316,1.020335,1.020335,0.502436,-1.26,5.869199,1.925197,0.932992,1.255485,CNc1nc(n(n1)C)N
4,124.063663,124.143,3.0,1.0,0.0,1.0,3.0,9.0,17.0,3.0,...,3.111471,2.166863,2.166863,1.573785,-0.59,5.214986,1.282121,0.520627,0.742916,OC1CN2CC2(C1)C#N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,111.032028,111.100,3.0,1.0,3.0,1.0,2.0,8.0,13.0,3.0,...,1.974283,0.506940,0.506940,0.185395,-1.30,6.700000,3.875439,2.912766,3.245680,C#CC(=O)CNC=O
19996,127.038176,127.103,5.0,1.0,0.0,1.0,5.0,9.0,14.0,5.0,...,2.273197,0.840679,0.840679,0.367230,-1.39,5.741406,1.847976,1.042171,1.178887,CN1C=NC(=O)OC1=N
19997,127.099714,127.187,2.0,1.0,2.0,1.0,2.0,9.0,22.0,2.0,...,3.486675,2.111107,2.111107,1.189968,-0.53,6.588064,2.010927,0.933931,1.472013,CCC1(CC)CC(=N)O1
19998,124.088815,124.183,1.0,1.0,2.0,1.0,1.0,9.0,21.0,1.0,...,3.417181,2.206273,2.206273,1.497832,-0.48,6.637371,2.408841,0.951289,1.776485,CCC1C(CO)C1C#C


In [None]:
df_final1.to_csv('regina_data_with_descriptors.csv')

## Mordred

In [None]:
!pip install mordred

Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176720 sha256=28be401b15bb455a0e11e2846b6d7cb5583f501ddebc3eee676db31f82eec629
  Stored in directory: /root/.cache/pip/wheels/a7/4f/b8/d4c6591f6ac944aaced7865b349477695f662388ad958743c7
Successfully built mordred
Installing collected packages: networkx, mordred
  Attempting uninstall: networkx
    Found existing installation: networkx 3.2.1
    Uninst

In [None]:
from mordred import Calculator, descriptors
from rdkit import Chem


# Initialize the calculator with all available descriptors
calc = Calculator(descriptors, ignore_3D=True)

# Convert SMILES strings to RDKit molecules
molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Calculate descriptors for each molecule
desc_results = calc.pandas(molecules)

# Print or use the descriptors as needed
desc_results


100%|██████████| 20000/20000 [28:27<00:00, 11.71it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,10.556763,2.367358,4.734716,10.556763,1.172974,3.120295,...,9.317310,39.107202,129.042593,8.065162,90,9,42.0,48.0,3.562500,2.250000
1,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,10.627503,1.902113,3.804226,10.627503,1.180834,2.997414,...,7.311218,33.969149,126.104465,5.482803,120,6,30.0,28.0,3.750000,2.500000
2,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,11.362527,2.482157,4.658068,11.362527,1.262503,3.235538,...,9.353661,58.992483,127.063329,7.059074,80,10,50.0,60.0,3.194444,1.888889
3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,10.860595,2.297009,4.297009,10.860595,1.206733,3.103836,...,8.705994,51.545541,127.085795,7.060322,86,9,42.0,47.0,4.083333,2.111111
4,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,11.497238,2.600596,4.646830,11.497238,1.277471,3.254085,...,9.679156,61.155777,124.063663,7.297863,81,8,52.0,65.0,3.284722,1.958333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,9.409260,1.989044,3.978088,9.409260,1.176158,2.888520,...,7.623153,33.372552,111.032028,8.540925,76,6,28.0,27.0,4.111111,2.166667
19996,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,10.792280,2.245827,4.491654,10.792280,1.199142,3.099448,...,8.806724,37.839725,127.038176,9.074155,84,10,42.0,46.0,4.083333,2.027778
19997,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,10.485281,2.414214,4.828427,10.485281,1.165031,3.130568,...,9.511851,39.613204,127.099714,5.777260,88,9,44.0,51.0,4.173611,2.166667
19998,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,11.328091,2.481194,4.283132,11.328091,1.258677,3.170885,...,9.168372,58.744849,124.088815,5.908991,93,9,42.0,51.0,4.083333,2.333333


In [None]:
df_with_descriptors = pd.concat([df, desc_results], axis=1)

In [None]:
df_with_descriptors.to_csv('regina_data_with_descriptors.csv')

In [None]:
from google.colab import drive
drive.mount('/content/drive')