<a href="https://colab.research.google.com/github/Vanitha-Jain/capstone/blob/Molecular-descriptors_-Unknown/RDKIT_UK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [None]:
!pip install mordred

Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176718 sha256=1ddaed6bde7bf41ae35ba2912d138ff219104ca18763aa22d249768fc1de1633
  Stored in directory: /root/.cache/pip/wheels/8b/30/0b/84e3f6775306e74cf5957ee4d16b10bf3927dcec44c

In [None]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [None]:
# Load the compound table from the Excel workbook into a DataFrame.
dataset=pd.read_excel('/content/Cluster 1 Rank.xlsx')

In [None]:
# Keep only the rows that actually carry a SMILES string.
has_smiles = dataset['smiles'].notna()
df = dataset[has_smiles]

In [None]:
# Display the filtered table as the cell output.
df

Unnamed: 0,Compounds Name,IMPPAT_id,smiles
0,Curcumenol,IMPHY010676,CC(=C1C[C@]23O[C@]1(O)C=C([C@@H]3CC[C@@H]2C)C)C
1,Rutin,IMPHY015047,Oc1cc(O)c2c(c1)oc(c(c2=O)O[C@@H]1O[C@H](CO[C@@...
2,Harmine,IMPHY004632,COc1ccc2c(c1)[nH]c1c2ccnc1C
3,Cycloartanol,IMPHY003952,CC(CCC[C@H]([C@H]1CC[C@@]2([C@]1(C)CC[C@@]13[C...
4,Thymol,IMPHY006550,Cc1ccc(c(c1)O)C(C)C
5,Leurosine,IMPHY000998,CC[C@@]12CN3CCc4c([C@@](C[C@@H]([C@H]2O1)C3)(C...
6,Nevadensin,IMPHY001776,COc1ccc(cc1)c1cc(=O)c2c(o1)c(OC)c(c(c2O)OC)O
7,Corilagin,IMPHY010965,O[C@@H]1[C@H]2COC(=O)c3cc(O)c(c(c3-c3c(C(=O)O[...
8,Bilobol,IMPHY005536,CCCCCC/C=CCCCCCCCc1cc(O)cc(c1)O
9,Amygdaloside,IMPHY003242,OC[C@H]1O[C@@H](OC[C@H]2O[C@@H](OC(c3ccccc3)C#...


In [None]:
def RDkit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` per SMILES.

    Parameters
    ----------
    smiles : iterable
        SMILES strings, one per molecule.

    Returns
    -------
    Mol_descriptors : list of tuple
        One tuple of descriptor values per input entry, in input order.
        An entry that is not a string (for example a missing cell) or that
        RDKit cannot parse yields a tuple of NaN values, so the result
        always has exactly one row per input entry.
    desc_names : sequence of str
        Descriptor names, in the same order as the values in each tuple.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    # Placeholder row for entries that cannot be turned into a molecule.
    missing_row = tuple(float('nan') for _ in desc_names)

    Mol_descriptors = []
    for smi in smiles:
        # MolFromSmiles returns None for an unparsable SMILES; passing that
        # None on to AddHs would raise and abort the whole batch.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            Mol_descriptors.append(missing_row)
            continue
        # Add explicit hydrogens before calculating the descriptors.
        mol = Chem.AddHs(mol)
        Mol_descriptors.append(calc.CalcDescriptors(mol))
    return Mol_descriptors, desc_names

# Split the SMILES into chunks of 100,000 rows so that results can be
# checkpointed to disk after every chunk.
chunk_size = 100000
# Rows without a SMILES string cannot be parsed, so leave them out up front.
smiles_table = dataset.dropna(subset=['smiles'])
chunks = [smiles_table[i:i+chunk_size] for i in range(0, len(smiles_table), chunk_size)]

total_chunks=len(chunks)
total_time=0

# Resume from an earlier run if an output file is already present.
# The file is written with to_csv below, so it must be read back with read_csv.
if os.path.isfile('RDkit_descriptors.csv'):
    existing_data = pd.read_csv('RDkit_descriptors.csv', index_col=0)
else:
    existing_data = pd.DataFrame()

# Chunk numbers that are already present in the saved output.
if 'chunk_index' in existing_data.columns:
    processed_chunks = set(existing_data['chunk_index'].dropna().astype(int).tolist())
else:
    processed_chunks = set()

# Calculate descriptors for each chunk and concatenate the results
for i, chunk in enumerate(tqdm(chunks, desc='Processing', total=len(chunks))):
    # Skip only the chunks whose own index was saved by an earlier run;
    # comparing total saved rows to a chunk's length would skip new chunks.
    if i in processed_chunks:
        continue
    # Calculate descriptors for this chunk
    descriptors, desc_names = RDkit_descriptors(chunk['smiles'])
    # Convert the descriptors to a dataframe
    df_with_200_descriptors = pd.DataFrame(descriptors, columns=desc_names,)
    # Add the chunk index as a new column
    df_with_200_descriptors['chunk_index'] = i
    # Append to the accumulated data; ignore_index keeps row labels unique
    # instead of restarting from 0 for every chunk.
    existing_data = pd.concat([existing_data, df_with_200_descriptors], axis=0, ignore_index=True)
    # Save the data after each chunk
    existing_data.to_csv('RDkit_descriptors.csv')

# Save the final data
existing_data.to_csv('RDkit_descriptors.csv')

Processing: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it]


In [1]:
import pandas as pd
input_file = '/content/RDkit_descriptors_UK.csv'

!pip install scikit-learn
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
data = pd.read_csv(input_file)
data



Unnamed: 0.1,Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,chunk_index
0,0,9.101441,-4.348142,9.101441,1.833364,0.652275,234.339,212.163,234.16198,94,...,0,0,0,0,0,0,0,0,0,0
1,1,14.820857,-5.23616,14.820857,1.175701,0.139518,610.521,580.281,610.153385,234,...,0,0,0,0,0,0,0,0,0,0
2,2,8.257457,-3.022891,8.257457,0.287693,0.672864,212.252,200.156,212.094963,80,...,0,0,0,0,0,0,0,0,0,0
3,3,10.651089,-6.728689,10.651089,4.055584,0.467089,428.745,376.329,428.401816,178,...,0,0,0,0,0,0,0,0,0,0
4,4,8.05366,-3.55386,8.05366,0.99821,0.652274,150.221,136.109,150.104465,60,...,0,0,0,0,0,0,0,0,0,0
5,5,17.314783,-6.492123,17.314783,1.067883,0.154082,808.973,752.525,808.404729,314,...,0,0,0,0,0,0,0,0,0,0
6,6,13.378285,-3.391574,13.378285,0.883123,0.750638,344.319,328.191,344.089603,130,...,0,0,0,0,0,0,0,0,0,0
7,7,14.793381,-5.029979,14.793381,1.182634,0.097892,634.455,612.279,634.080614,238,...,0,0,0,0,0,0,0,0,0,0
8,8,8.411618,-4.944314,8.411618,1.197195,0.321408,318.501,284.229,318.25588,130,...,0,0,0,0,0,0,0,0,0,0
9,9,10.069684,-5.1225,10.069684,0.911295,0.217518,457.432,430.216,457.158411,178,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Columns named 'Unnamed: ...' are row indexes that were written to the CSV
# without a header. The loaded table has both 'Unnamed: 0.1' and 'Unnamed: 0',
# so drop every such column; otherwise the leftover one would be binned and
# one-hot encoded as if it were a descriptor.
index_artifacts = [name for name in data.columns if str(name).startswith('Unnamed:')]
data = data.drop(columns=index_artifacts)

In [3]:
def discretize_column(col, bins=3):
    """Map a numeric column onto integer bin codes.

    The value range of ``col`` is split into ``bins`` equal-width intervals
    by ``pd.cut``; each value is replaced by the zero-based position of the
    interval it falls into.
    """
    bin_codes = pd.cut(col, bins, labels=False)
    return bin_codes

In [4]:
# Work on a copy so that `data` keeps the original continuous values.
discretized_data = data.copy()
# Only float64/int64 columns are binned; every other column is left as is.
numeric_columns = discretized_data.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    binned = discretize_column(discretized_data[column])
    discretized_data[column] = binned

In [5]:
# Learn the categories of every column, then expand each one into indicator
# columns returned as a dense array.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(discretized_data)
encoded_data = encoder.transform(discretized_data)

In [6]:
# Label each indicator column with the name generated by the fitted encoder.
feature_names = encoder.get_feature_names_out(discretized_data.columns)
encoded_df = pd.DataFrame(encoded_data, columns=feature_names)

In [7]:
# Write the one-hot encoded table to a CSV file in the working directory.
output_file = "onehot_encoded_rdkit_descriptors.csv"
# index=False keeps the row index out of the file.
encoded_df.to_csv(output_file, index=False)

# Confirm where the result was written.
print(f"One-hot encoded data saved to {output_file}")

One-hot encoded data saved to onehot_encoded_rdkit_descriptors.csv
