In [1]:
from rdkit import Chem
import pandas as pd
from chembl_structure_pipeline import standardize_mol, get_parent_mol

In [2]:
# Load the seed molecules and keep only their SMILES strings in a one-column frame.
initial_smi = pd.DataFrame(
    list(pd.read_excel(r'../prediction_and_selection/Prediction_initializers_ROR_activ.xlsx')['SMILES']),
    columns=['SMILES'],
)

# The same summary is shown under both headings, so compute it once.
initial_summary = initial_smi.describe()
print("Initial data description: ")
print(initial_summary)

print("SMILES used to predict new molecules description: ")
print(initial_summary)

print("Examples of initial structures: ")
print(initial_smi.head(3))

Initial data description: 
                                                   SMILES
count                                                   5
unique                                                  5
top     CC(=O)Nc1ccc2c(c1)CCCN2Cc1ccc(C(O)(C(F)(F)F)C(...
freq                                                    1
SMILES used to predict new molecules description: 
                                                   SMILES
count                                                   5
unique                                                  5
top     CC(=O)Nc1ccc2c(c1)CCCN2Cc1ccc(C(O)(C(F)(F)F)C(...
freq                                                    1
Examples of initial structures: 
                                              SMILES
0               CCC(=O)Nc1ccc2nn(-c3ccc(CC)cc3)nc2c1
1                Cc1cccc(C(=O)Nc2cc(-c3ccccc3)no2)c1
2  CC(=O)N1CCN(Cc2ccc(-c3ccc(C(O)(C(F)(F)F)C(F)(F...


In [5]:
# Load the reference RORgamma compounds; the resulting frame has a single unnamed column 0.
initial_smi0 = pd.DataFrame(
    list(pd.read_excel(r'../model/data/RORgamma_active_compounds.xlsx')['SMILES_canonical'])
)
print("Initial data description: ")
print(initial_smi0.describe())

# Keep only the text before the first space of every entry.
initial_smi_ = list(initial_smi0[0])
corrected_initial_list = [str(raw_entry).partition(' ')[0] for raw_entry in initial_smi_]
initial_smi0['SMILES_clean'] = corrected_initial_list
all_kiniase = initial_smi0['SMILES_clean']

print("All Kinase Inhibitors dataset molecules description: ")
print(all_kiniase.describe())

print("Examples of initial structures: ")
print(all_kiniase.head(3))

Initial data description: 
                                                        0
count                                                  36
unique                                                 35
top     O=C(O)C1CCC(c2nn(C(=O)c3c(Cl)cccc3C(F)(F)F)c3c...
freq                                                    2
All Kinase Inhibitors dataset molecules description: 
count                                                    36
unique                                                   35
top       O=C(O)C1CCC(c2nn(C(=O)c3c(Cl)cccc3C(F)(F)F)c3c...
freq                                                      2
Name: SMILES_clean, dtype: object
Examples of initial structures: 
0    CC(C)CCCC(C)(O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC21C
1    CC(C)CCC(O)C(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C
2    CC(CCCC(C)(C)O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C
Name: SMILES_clean, dtype: object


In [6]:
# Alias the cleaned reference SMILES Series and preview its first rows.
smiles = all_kiniase
smiles.head()

0     CC(C)CCCC(C)(O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC21C
1     CC(C)CCC(O)C(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C
2     CC(CCCC(C)(C)O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C
3    CC1CCC2(C(=O)O)CCC3(C)C(=CCC4C5(C)CCC(O)C(C)(C...
4    CC1OC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC...
Name: SMILES_clean, dtype: object

In [7]:
# Convert the SMILES Series into a plain Python list.
new_set0 = smiles.to_list()

In [8]:
# Wrap the list in a one-column frame so pandas de-duplication can be applied; show its shape.
dff_15 = pd.DataFrame(new_set0,columns=['SMILES'])
dff_15.shape

(36, 1)

In [9]:
# Remove rows with identical SMILES strings and show the resulting shape.
dff_17 = dff_15.drop_duplicates()
dff_17.shape

(35, 1)

In [10]:
# Renumber the rows 0..n-1 after de-duplication.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column, which also keeps this cell safe to run more than once.
dff_17 = dff_17.reset_index(drop=True)

In [11]:
# De-duplicated SMILES as a plain list; display the first entry as a spot check.
new_set1 = dff_17['SMILES'].to_list()
new_set1[0]

'CC(C)CCCC(C)(O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC21C'

In [12]:
# Summarise the de-duplicated column (count and unique should now agree).
dff_17['SMILES'].describe()

count                                                    35
unique                                                   35
top       CC(=O)N1CCN(c2ccc(CN(CC(C)C)S(=O)(=O)Cc3ccccc3...
freq                                                      1
Name: SMILES, dtype: object

In [13]:
# Run every reference structure through get_parent_mol (neutralize=True) and write the
# result back as SMILES. get_parent_mol returns a tuple; element [0] is the molecule used here.
all_kiniase = [
    Chem.MolToSmiles(
        get_parent_mol(
            Chem.MolFromSmiles(smi),
            neutralize=True,
            check_exclusion=True,
            verbose=False,
        )[0]
    )
    for smi in new_set1
]

In [14]:
# Preview the first five parent SMILES.
all_kiniase[0:5]

['CC(C)CCCC(C)(O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC21C',
 'CC(C)CCC(O)C(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C',
 'CC(CCCC(C)(C)O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C',
 'CC1CCC2(C(=O)O)CCC3(C)C(=CCC4C5(C)CCC(O)C(C)(C)C5CCC43C)C2C1C',
 'CC1OC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC(O)C5(C)C(C7=CC(=O)OC7)CCC65O)C4)OC3C)OC2C)CC(O)C1O']

In [15]:
# Ensure all_kiniase is a plain list (makes a shallow copy if it already is one).
all_kiniase = list(all_kiniase)

In [16]:
# Re-canonicalise each parent SMILES with isomericSmiles=False.
all_kiniase_cannonical = []
for parent_smi in all_kiniase:
    all_kiniase_cannonical.append(
        Chem.MolToSmiles(Chem.MolFromSmiles(parent_smi), isomericSmiles=False)
    )

In [17]:
# Preview the first five canonical SMILES.
all_kiniase_cannonical[0:5]

['CC(C)CCCC(C)(O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC21C',
 'CC(C)CCC(O)C(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C',
 'CC(CCCC(C)(C)O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C',
 'CC1CCC2(C(=O)O)CCC3(C)C(=CCC4C5(C)CCC(O)C(C)(C)C5CCC43C)C2C1C',
 'CC1OC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC(O)C5(C)C(C7=CC(=O)OC7)CCC65O)C4)OC3C)OC2C)CC(O)C1O']

In [18]:
# Number of canonical reference SMILES.
len(all_kiniase_cannonical)

35

In [19]:
# Canonicalisation can map different input strings to the same SMILES,
# so de-duplicate again on the canonical form and summarise the result.
to_remove_duplicates = pd.DataFrame(
    all_kiniase_cannonical, columns=['SMILES']
).drop_duplicates()
to_remove_duplicates.describe()

Unnamed: 0,SMILES
count,35
unique,35
top,CC(=O)N1CCN(c2ccc(CN(CC(C)C)S(=O)(=O)Cc3ccccc3...
freq,1


In [20]:
# Renumber rows 0..n-1 so the Series below can be indexed by position in later loops.
# drop=True discards the old index rather than adding it as a stray 'index' column,
# and keeps the cell safe to re-run.
to_remove_duplicates = to_remove_duplicates.reset_index(drop=True)
all_kiniase_cannonical = to_remove_duplicates['SMILES']

In [22]:
# Load the processed ZINC20 selection; later cells refer to it as the training data.
whole_data_set = pd.read_parquet('../model/data/zinc20_selected_to_create_model_processed.parquet') #this file is correct

In [23]:
# Preview the first rows of the training table.
whole_data_set.head()

Unnamed: 0,index,smiles,SELFIES,SELFIES_length,SMILES_canonical,SELFIES_canonical,SELFIES_length_canonical
0,563932204,CNC(=O)CCCN1CCN(S(C)(=O)=O)[C@@H](C)C1,[C][N][C][=Branch1][C][=O][C][C][C][N][C][C][N...,30,CNC(=O)CCCN1CCN(S(C)(=O)=O)C(C)C1,[C][N][C][=Branch1][C][=O][C][C][C][N][C][C][N...,30
1,51885268,O=C(Cn1cc([N+](=O)[O-])cn1)N1CC[C@@H](CO)C1,[O][=C][Branch1][S][C][N][C][=C][Branch1][=Bra...,30,O=C(Cn1cc([N+](=O)[O-])cn1)N1CCC(CO)C1,[O][=C][Branch1][S][C][N][C][=C][Branch1][=Bra...,30
2,1099938503,COCCCN1CC[C@@H](NC(=O)c2cnco2)[C@H](O)C1,[C][O][C][C][C][N][C][C][C@@H1][Branch1][=N][N...,30,COCCCN1CCC(NC(=O)c2cnco2)C(O)C1,[C][O][C][C][C][N][C][C][C][Branch1][=N][N][C]...,30
3,1368537405,C[C@@H](O)CNC[C@H]1CN(C(=O)Cc2cnc[nH]2)CCO1,[C][C@@H1][Branch1][C][O][C][N][C][C@H1][C][N]...,30,CC(O)CNCC1CN(C(=O)Cc2cnc[nH]2)CCO1,[C][C][Branch1][C][O][C][N][C][C][C][N][Branch...,30
4,1379339000,C[C@H](CNCc1cn(C)nn1)NC(=O)COCC1CC1,[C][C@H1][Branch1][=C][C][N][C][C][=C][N][Bran...,30,CC(CNCc1cn(C)nn1)NC(=O)COCC1CC1,[C][C][Branch1][=C][C][N][C][C][=C][N][Branch1...,30


In [24]:
# Summary statistics for the numeric columns of the training table.
whole_data_set.describe()

Unnamed: 0,index,SELFIES_length,SELFIES_length_canonical
count,121000.0,121000.0,121000.0
mean,906010400.0,39.981058,40.0
std,493936200.0,4.466452,4.472154
min,223827.0,30.0,30.0
25%,510092900.0,37.0,37.0
50%,813387200.0,40.0,40.0
75%,1350868000.0,43.0,43.0
max,2113474000.0,50.0,50.0


## The best option is to compare the canonical forms of both datasets

In [25]:
# Count reference compounds whose canonical SMILES also occurs in the training data.
# The training column is converted to a set once, so each membership test is a hash
# lookup instead of rebuilding and scanning the full list on every iteration.
training_canonical = set(whole_data_set['SMILES_canonical'])
first_check = 0
for smi in all_kiniase_cannonical:
    if smi in training_canonical:
        first_check += 1
        print("Present in training data : " + str(smi))

In [26]:
# Number of reference compounds found in the training data.
first_check

0

One can conclude that none of the RORγ active compounds is present in the training data.

In [27]:
# Scratch check that `in` tests list membership the way the overlap loops rely on.
a = 1
b = [1, 2, 3]
if a in b:
    print(a)

1


## Whole dataset checking

In [28]:
# Load the larger processed ZINC20 table (raw 'smiles' column only, see preview below).
whole_data_set_almost_1M = pd.read_parquet('../model/data/zinc20_FK_DC_BB_JA_HE_GA_KG_IC_CB_HJ_processed.parquet')

In [29]:
# Preview the first rows of the large ZINC table.
whole_data_set_almost_1M.head()

Unnamed: 0,smiles
336474016,CC1(C)COCCN1C(=O)CCNC(N)=O
1086464356,CCN[C@H]1C[C@H](CNC(=O)[C@@H]2CNC(=O)N2)C1
248404898,CO[C@H](C)CN1CCOC[C@H]1C(N)=O
1574709678,O=C(O)CCNC(=O)c1ccc(CO)nc1
1119650207,C[C@@H](CC(=O)O)C(=O)N1C[C@H](O)C[C@H]1CO


In [30]:
# Parse every SMILES string of the large ZINC table into an RDKit molecule.
mols = list(map(Chem.MolFromSmiles, whole_data_set_almost_1M['smiles']))

In [31]:
# Canonical SMILES for the large ZINC table.
# isomericSmiles=False is required here: every query list in this notebook is
# canonicalised without stereochemistry, while the raw ZINC strings carry stereo
# centres. With the default (isomeric) output a stereo-bearing entry could never
# equal a query string, so the overlap checks would report false negatives.
whole_data_set_almost_1M['SMILES_canonical'] = [
    Chem.MolToSmiles(mol, isomericSmiles=False) for mol in mols
]

In [32]:
# Count reference compounds whose canonical SMILES occurs in the large ZINC table.
# Build the lookup set once; converting the column to a list inside the loop
# repeated that work and a linear scan for every query.
zinc_canonical = set(whole_data_set_almost_1M['SMILES_canonical'])
second_check = 0
for smi in all_kiniase_cannonical:
    if smi in zinc_canonical:
        second_check += 1
        print("Present in whole ZINC dataset used : " + str(smi))

In [33]:
# Number of reference compounds found in the large ZINC table.
second_check

0

## Check if initial data is present in training dataset

In [34]:
# Canonical (non-isomeric) SMILES of the seed molecules; reused by later cells.
initial_kiniase_cannonical = [Chem.MolToSmiles(Chem.MolFromSmiles(smi), isomericSmiles=False) for smi in initial_smi['SMILES']]

# Count seed molecules that also occur in the training data.
# The loop must iterate over the seed list computed above: iterating over
# all_kiniase_cannonical only repeated the earlier reference-compound check and
# never tested the seed molecules at all.
training_canonical = set(whole_data_set['SMILES_canonical'])
initial_check = 0
for smi in initial_kiniase_cannonical:
    if smi in training_canonical:
        initial_check += 1
        print("Present in whole ZINC dataset used : " + str(smi))

In [35]:
# Value of the initial_check hit counter.
initial_check

0

## Checking if generated structures are present in ZINC dataset

In [36]:
# Load the molecules listed in the 0_1 tensor-scaling spreadsheet.
tensor_rotation_0_1 = pd.read_excel('Molecules_generated_tensor_scaling_0_1.xlsx')

In [37]:
# Keep only the generated SMILES column (the name now refers to a Series, not the frame).
tensor_rotation_0_1 = tensor_rotation_0_1['SMILES_From_Tensor_rotation']

In [38]:
# Canonicalise the generated SMILES with isomericSmiles=False.
tensor_rotation_0_1_cannonical = []
for generated_smi in tensor_rotation_0_1:
    tensor_rotation_0_1_cannonical.append(
        Chem.MolToSmiles(Chem.MolFromSmiles(generated_smi), isomericSmiles=False)
    )

In [39]:
# Preview the first five canonical generated SMILES.
tensor_rotation_0_1_cannonical[0:5]

['CCC(=O)NCC1=CN(c2ncnc(C#N)c2C)C1SOC',
 'CCC(=O)NCc1ccnn1[SH]1C=C(c2ccccc2F)NN1',
 'CCC(=O)NCc1cncc(S(=O)(=O)C(C)CC#N)n1',
 'CCC(=O)N1CC2C1CC2(C#N)S(=O)c1ccccn1',
 'CCC(=O)N1c2ccn(-c3ncnc(C#N)c3C)c2[SH]1C']

In [40]:
# Number of generated molecules in this set.
len(tensor_rotation_0_1_cannonical)

55

In [41]:
# Count generated (0_1) molecules that also occur in whole_data_set.
# The lookup set is built once rather than converting the column to a list
# and scanning it linearly for every generated molecule.
training_canonical = set(whole_data_set['SMILES_canonical'])
third_check = 0
for smi in tensor_rotation_0_1_cannonical:
    if smi in training_canonical:
        third_check += 1
        print("Present in whole ZINC dataset used : " + str(smi))

In [42]:
# Value of the third_check hit counter.
third_check

0

In [43]:
# Count generated (0_1) molecules that also occur in the large ZINC table.
# Hits are accumulated in fourth_check; incrementing third_check here left
# fourth_check at 0 regardless of how many matches were found.
zinc_canonical = set(whole_data_set_almost_1M['SMILES_canonical'])
fourth_check = 0
for smi in tensor_rotation_0_1_cannonical:
    if smi in zinc_canonical:
        fourth_check += 1
        print("Present in whole ZINC dataset used : " + str(smi))

In [44]:
# Value of the fourth_check hit counter.
fourth_check

0

## Check if generated data is in initial data

In [46]:
# Count generated (0_1) molecules that are identical to one of the seed molecules.
tensor_0_1_check = 0
seed_smiles = list(initial_kiniase_cannonical)
for generated_smi in tensor_rotation_0_1_cannonical:
    if generated_smi in seed_smiles:
        tensor_0_1_check += 1
        print("Present in initial dataset (ROR-y activ) used : " + str(generated_smi))

Present in initial dataset (ROR-y activ) used : Cc1cccc(C(=O)Nc2cc(-c3ccccc3)no2)c1


In [47]:
# Number of generated (0_1) molecules that match a seed molecule.
tensor_0_1_check

1

In [48]:
# Load the molecules listed in the 0_2 tensor-scaling spreadsheet.
tensor_rotation_0_2 = pd.read_excel('Molecules_generated_tensor_scaling_0_2.xlsx')

In [49]:
# Keep only the generated SMILES column (the name now refers to a Series, not the frame).
tensor_rotation_0_2 = tensor_rotation_0_2['SMILES_From_Tensor_rotation']

In [50]:
# Canonicalise the generated SMILES with isomericSmiles=False.
tensor_rotation_0_2_cannonical = []
for generated_smi in tensor_rotation_0_2:
    tensor_rotation_0_2_cannonical.append(
        Chem.MolToSmiles(Chem.MolFromSmiles(generated_smi), isomericSmiles=False)
    )

In [51]:
# Preview the first five canonical generated SMILES.
tensor_rotation_0_2_cannonical[0:5]

['CCC(=O)NCC1=NN=C(C=C(C)C=CC(=N)OC)SC1',
 'CCC(=O)NCc1ccnn1SC=C(NN)c1ccccc1F',
 'CCC1(C)CCC(=O)NCCC1Sc1nc[nH]n1',
 'CCC(=O)NC1CC(c2n[nH]cc2C(C)=C=N)O1',
 'CCC(=O)NCc1ccnn1Sc1[nH][nH]c1-c1ccccc1F']

In [52]:
# Number of generated molecules in this set.
len(tensor_rotation_0_2_cannonical)

78

In [53]:
# Count generated (0_2) molecules that also occur in whole_data_set.
# The lookup set is built once rather than converting the column to a list
# and scanning it linearly for every generated molecule.
training_canonical = set(whole_data_set['SMILES_canonical'])
fifth_check = 0
for smi in tensor_rotation_0_2_cannonical:
    if smi in training_canonical:
        fifth_check += 1
        print("Present in whole ZINC dataset used : " + str(smi))

In [54]:
# Value of the fifth_check hit counter.
fifth_check

0

In [55]:
# Count generated (0_2) molecules that also occur in the large ZINC table.
# The lookup set is built once rather than converting the column to a list
# and scanning it linearly for every generated molecule.
zinc_canonical = set(whole_data_set_almost_1M['SMILES_canonical'])
sixth_check = 0
for smi in tensor_rotation_0_2_cannonical:
    if smi in zinc_canonical:
        sixth_check += 1
        print("Present in whole ZINC dataset used : " + str(smi))

In [56]:
# Value of the sixth_check hit counter.
sixth_check

0

In [57]:
# Count generated (0_2) molecules that are identical to one of the seed molecules.
# The message names the initial dataset, matching the equivalent 0_1 check; the
# previous text claimed a ZINC hit although ZINC is not consulted here.
seed_smiles = set(initial_kiniase_cannonical)
tensor_0_2_check = 0
for smi in tensor_rotation_0_2_cannonical:
    if smi in seed_smiles:
        tensor_0_2_check += 1
        print("Present in initial dataset (ROR-y activ) used : " + str(smi))

Present in whole ZINC dataset used : Cc1cccc(C(=O)Nc2cc(-c3ccccc3)no2)c1


In [58]:
# Number of generated (0_2) molecules that match a seed molecule.
tensor_0_2_check

1

None of the generated structures are present in the ZINC database. In each generated set, one structure is identical to one of the initial (seed) compounds.