In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os


import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Descriptors import CalcMolDescriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as MDC

# Download data

In [None]:
# Notebook configuration: everything a reader might want to tune lives here.
# Target identifier; it is interpolated into the download URL below.
target = 'aa2ar'
# Base URL of the target's files (single slash after the host; the previous
# double slash was a typo that only worked if the server normalised the path).
url = f'https://dude.docking.org/targets/{target}'
# Directory that receives the descriptor CSV files.
data_path = './data'
# Directory created by dir_builder(); presumably for trained models
# (not used by the cells visible here) - TODO confirm.
path_model = './models'
# Directory that receives the saved figures.
pics = './images'

In [None]:
def dir_builder(*dirs):
    """Create the notebook's working directories if they do not exist yet.

    Parameters
    ----------
    *dirs : str
        Directories to create. When none are given, the notebook-level
        settings ``path_model``, ``pics`` and ``data_path`` are used.

    Notes
    -----
    The previous version tested ``data`` (an unrelated / undefined name)
    instead of ``data_path``, so it raised on a fresh kernel and never
    created the data directory.
    """
    if not dirs:
        # Resolved at call time so the config cell can be edited and re-run.
        dirs = (path_model, pics, data_path)
    for directory in dirs:
        # exist_ok=True makes the call idempotent; makedirs also creates
        # missing parent directories, which os.mkdir would not.
        os.makedirs(directory, exist_ok=True)

# Create the working directories up front, before later cells write CSVs and figures.
dir_builder()

In [None]:
# Fetch the three .ism files for the configured target straight from `url`.
# All three are parsed the same way: no header row, space-separated fields,
# and lines pandas cannot parse are skipped instead of raising.
_ism_read_opts = dict(header=None, sep=' ', on_bad_lines='skip')

active = pd.read_csv(f'{url}/actives_combined.ism', **_ism_read_opts)
inactive = pd.read_csv(f'{url}/inactives_combined.ism', **_ism_read_opts)
decoys = pd.read_csv(f'{url}/decoys_final.ism', **_ism_read_opts)

In [None]:
# Reduce every downloaded table to its first column (renamed to 'smiles') and
# attach the class label: 1 for actives, 0 for inactives and decoys.
# NOTE: this rebinds active / inactive / decoys, so the cell can only be run
# once per download (a second run no longer finds column 0).
active = active[[0]].rename(columns={0: 'smiles'}).assign(label=1)
inactive = inactive[[0]].rename(columns={0: 'smiles'}).assign(label=0)
decoys = decoys[[0]].rename(columns={0: 'smiles'}).assign(label=0)

# Row counts per class.
active_size = len(active)
inactive_size = len(inactive)
decoys_size = len(decoys)

# Feature generation

In [None]:
def RDkit_descriptors(data):
    """Compute the RDKit descriptor set for every parsable molecule in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose a ``smiles`` column (SMILES strings) and a ``label`` column.

    Returns
    -------
    mol_descs : list of dict
        One dict per successfully parsed molecule, as returned by
        ``CalcMolDescriptors`` plus a ``'label'`` entry copied from ``data``.
        Rows whose SMILES is missing or cannot be parsed are skipped, so this
        list may be shorter than ``data``.
    desc_names : list of str
        Descriptor names in RDKit's order, followed by ``"label"``.
    """
    # Take the names from RDKit's descriptor registry instead of computing the
    # first molecule twice; this also works for empty input.
    desc_names = [name for name, _ in Descriptors.descList]
    desc_names.append("label")

    mol_descs = []
    skipped = 0
    # Iterate values positionally: `data.label[i]` was a label-based lookup and
    # broke on any frame whose index is not 0..n-1 (e.g. after filtering).
    for smiles, label in tqdm(zip(data.smiles, data.label), total=len(data)):
        # MolFromSmiles returns None for invalid SMILES and rejects non-strings
        # (e.g. NaN from a short input line); skip both instead of crashing.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            skipped += 1
            continue
        descriptors = CalcMolDescriptors(mol)
        descriptors['label'] = label
        mol_descs.append(descriptors)

    if skipped:
        print(f"Skipped {skipped} rows with missing or unparsable SMILES")
    return mol_descs, desc_names

In [None]:
# Pair each class frame with the name used for its output file and plot label;
# the two lists below stay index-aligned.
_class_frames = {'active': active, 'inactive': inactive, 'decoys': decoys}
classes_names = list(_class_frames)
mol_classes = list(_class_frames.values())

In [None]:
# For every class: compute the descriptor records and write them to
# '<data_path>/<class_name>_descs_raw.csv' without an index column.
for mol_class, class_name in zip(mol_classes, classes_names):
    print("Uploading", class_name, "dataset")
    # mol_descs: list of per-molecule dicts; desc_names: column names incl. 'label'.
    mol_descs, desc_names = RDkit_descriptors(mol_class)
    # columns= pins the column order of the resulting frame to desc_names.
    descriptors_df = pd.DataFrame(mol_descs, columns=desc_names)
    descriptors_df.to_csv(f'{data_path}/{class_name}_descs_raw.csv', index=False)

# Data preparation

In [None]:
# Reload the per-class descriptor tables written by the feature-generation
# step; from here on active / inactive / decoys hold descriptor frames.
active, inactive, decoys = (
    pd.read_csv(raw_csv, index_col=None)
    for raw_csv in (
        f'{data_path}/active_descs_raw.csv',
        f'{data_path}/inactive_descs_raw.csv',
        f'{data_path}/decoys_descs_raw.csv',
    )
)

# Number of rows in each reloaded table.
active_size, inactive_size, decoys_size = map(len, (active, inactive, decoys))

In [None]:
# Pie chart of the class sizes in the raw descriptor tables.
colors = sns.color_palette('pastel')[0:5]
data = [active_size, inactive_size, decoys_size]

# Draw on a dedicated figure/axes rather than pyplot's implicit current
# figure, so nothing drawn earlier can end up in the saved image when the
# notebook is executed without the inline backend.
fig, ax = plt.subplots()
ax.pie(data, labels=classes_names, colors=colors, autopct='%.0f%%', textprops={"fontsize": 12})
ax.set_title('Raw data')
fig.savefig(f'{pics}/raw_data.png')

# Data resampling

In [None]:
# Top up the negative class with decoys so that inactives + sampled decoys
# match the number of actives. The count is clamped to [0, decoys_size]:
# DataFrame.sample raises for a negative n, and (without replacement) for an
# n larger than the frame.
decoys_new_size = min(max(active_size - inactive_size, 0), decoys_size)
# Fixed seed so Restart & Run All reproduces the same balanced dataset.
split_decoys = decoys.sample(n=decoys_new_size, random_state=42)

balanced_data = pd.concat([active, inactive, split_decoys], ignore_index=True)
balanced_data.to_csv(f'{data_path}/balanced_dataframe.csv', index=False)

In [None]:
# Pie chart of the class sizes after resampling the decoys.
colors = sns.color_palette('pastel')[0:5]
data = [active_size, inactive_size, decoys_new_size]

# Use a fresh figure/axes instead of pyplot's implicit current figure;
# otherwise, outside the inline backend, this pie would be drawn on top of
# whatever was plotted before and saved together with it.
fig, ax = plt.subplots()
ax.pie(data, labels=classes_names, colors=colors, autopct='%.0f%%', textprops={"fontsize": 12})
ax.set_title('Balanced data')
fig.savefig(f'{pics}/balanced_data.png')