<a href="https://colab.research.google.com/github/Nyillu/Algorithms_Big_Data_Zavadskaya/blob/main/get_descriptors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading descriptors

To compute the descriptors, two libraries are used: `mordred` and `rdkit`.

This notebook is meant to be run locally from within this repository. If you work in Google Colab, adjust the path to the `data.csv` file accordingly and install the libraries listed in the requirements.


The **result** of this notebook is the file `data_with_descriptors.csv`.


# Requirements

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from rdkit.Chem import Descriptors
from mordred import Calculator, descriptors

In [None]:
# Load the input table; the descriptor cells below read its `smiles` column.
# The path is relative to the working directory, so adjust it when running elsewhere (e.g. Colab).
df = pd.read_csv('data.csv')

# RDKit descriptors

In [None]:
# Descriptors._descList is a list of (name, function) pairs; split it once so
# the column names and the callables stay in the same order.
descriptor_names = [name for name, _ in Descriptors._descList]
descriptor_funcs = [func for _, func in Descriptors._descList]

# One row of descriptor values per SMILES string. MolFromSmiles returns None
# for strings it cannot parse; those rows are filled with None placeholders so
# that the row count still matches `df`.
parsed_mols = (Chem.MolFromSmiles(smiles) for smiles in df['smiles'])
descriptors_rdkit = [
  [None] * len(descriptor_funcs) if mol is None
  else [func(mol) for func in descriptor_funcs]
  for mol in parsed_mols
]

# Append the descriptor columns to the right of the original columns.
df_rdkit = pd.concat(
  [df, pd.DataFrame(descriptors_rdkit, columns=descriptor_names)],
  axis=1,
)

In [None]:
# Preview the input table with the RDKit descriptor columns appended.
df_rdkit

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,gdb_89835,OC1CC2(CC2)CC1=O,3.19334,1.22630,0.98880,2.7195,75.40,-0.2512,-0.0284,0.2229,...,0,0,0,0,0,0,0,0,0,0
1,gdb_36873,C1CC1NC2=NCCO2,4.92517,0.92431,0.81835,1.3423,77.99,-0.2144,0.0643,0.2788,...,0,0,0,0,0,0,0,0,0,0
2,gdb_19320,C#CC12CC1C1OC21,4.99590,1.69319,1.50919,1.4352,68.32,-0.2249,0.0250,0.2499,...,0,0,0,1,0,0,0,0,0,0
3,gdb_96404,CC1OCOCC1C#C,2.07981,1.71184,1.20167,1.8563,76.89,-0.2512,0.0501,0.3013,...,0,0,0,1,0,0,0,0,0,0
4,gdb_44243,N=C1OC2C1CCC2=O,3.22781,1.45953,1.19584,1.8143,69.64,-0.2506,-0.0448,0.2058,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,gdb_67709,CC12C3COC1OCC23,2.25194,1.97852,1.70017,2.7998,73.16,-0.2496,0.0810,0.3306,...,0,0,0,0,0,0,0,0,0,0
19996,gdb_23674,c1c(ncnc1F)F,4.22771,1.81684,1.27074,0.9724,50.29,-0.2919,-0.0473,0.2447,...,0,0,0,0,0,0,0,0,0,0
19997,gdb_45091,O=C1CC2NC2C2CN12,2.47194,2.08198,1.42861,4.9902,71.97,-0.2389,0.0135,0.2524,...,0,0,0,0,0,0,0,0,0,0
19998,gdb_52674,CC#CC(O)C1(C)CC1,2.63754,0.93810,0.84890,1.9889,87.48,-0.2487,0.0390,0.2877,...,0,0,0,0,0,0,0,0,0,0


# Mordred descriptors

In [None]:
# Compute the Mordred descriptors; ignore_3D=True skips the descriptors that
# need 3D coordinates, which molecules built from SMILES do not have.
calc = Calculator(descriptors, ignore_3D=True)
mols = [Chem.MolFromSmiles(smi) for smi in df_rdkit.smiles]

# MolFromSmiles returns None for SMILES it cannot parse. Only the parsed
# molecules are handed to Mordred; rows without a molecule end up with NaN
# descriptors, matching the None placeholders the RDKit cell uses for them.
valid_positions = [pos for pos, mol in enumerate(mols) if mol is not None]
df_mordred = calc.pandas([mols[pos] for pos in valid_positions])

# Re-attach the original row labels so the axis=1 concat lines each result up
# with its source row, then reindex to bring back the skipped rows.
df_mordred.index = df_rdkit.index[valid_positions]
df_mordred = df_mordred.reindex(df_rdkit.index)

# NOTE: this rebinds `df` to the combined table. Re-running the RDKit cell
# after this one would start from the already-augmented frame, so re-run from
# the data-loading cell instead.
df = pd.concat([df_rdkit, df_mordred], axis=1)

In [None]:
# Preview the combined table: input columns + RDKit descriptors + Mordred descriptors.
df

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,gdb_89835,OC1CC2(CC2)CC1=O,3.19334,1.22630,0.98880,2.7195,75.40,-0.2512,-0.0284,0.2229,...,9.458840,60.064540,126.068080,6.635162,79,9,52.0,63.0,3.284722,1.861111
1,gdb_36873,C1CC1NC2=NCCO2,4.92517,0.92431,0.81835,1.3423,77.99,-0.2144,0.0643,0.2788,...,8.761080,57.095610,126.079313,6.635753,92,6,46.0,52.0,1.972222,2.000000
2,gdb_19320,C#CC12CC1C1OC21,4.99590,1.69319,1.50919,1.4352,68.32,-0.2249,0.0250,0.2499,...,10.453947,64.774173,106.041865,7.574419,56,6,56.0,78.0,2.145833,1.638889
3,gdb_96404,CC1OCOCC1C#C,2.07981,1.71184,1.20167,1.8563,76.89,-0.2512,0.0501,0.3013,...,8.689296,37.472182,126.068080,6.635162,86,10,40.0,44.0,3.472222,2.194444
4,gdb_44243,N=C1OC2C1CCC2=O,3.22781,1.45953,1.19584,1.8143,69.64,-0.2506,-0.0448,0.2058,...,9.587475,54.352054,125.047678,7.815480,80,9,50.0,61.0,3.194444,1.916667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,gdb_67709,CC12C3COC1OCC23,2.25194,1.97852,1.70017,2.7998,73.16,-0.2496,0.0810,0.3306,...,10.121538,63.514450,126.068080,6.635162,70,9,60.0,81.0,2.395833,1.777778
19996,gdb_23674,c1c(ncnc1F)F,4.22771,1.81684,1.27074,0.9724,50.29,-0.2919,-0.0473,0.2447,...,8.479907,35.755147,116.018605,11.601860,61,7,36.0,38.0,3.222222,1.833333
19997,gdb_45091,O=C1CC2NC2C2CN12,2.47194,2.08198,1.42861,4.9902,71.97,-0.2389,0.0135,0.2524,...,9.791046,63.010421,124.063663,7.297863,76,11,58.0,75.0,2.305556,1.777778
19998,gdb_52674,CC#CC(O)C1(C)CC1,2.63754,0.93810,0.84890,1.9889,87.48,-0.2487,0.0390,0.2877,...,9.180809,58.176765,124.088815,5.908991,91,9,44.0,51.0,4.173611,2.083333


In [None]:
# Persist the combined table; index=False keeps the row index out of the CSV.
df.to_csv('data_with_descriptors.csv', index=False)