# Read data

In [13]:
import pandas as pd

# Load the two comma-separated compound tables. The previews below show an
# unnamed leading column plus 'SMILES' and 'ID' columns in each file.
data_1, data_5 = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('data/data_1.csv', 'data/data_5.csv')
)

# Show the first two rows of each table, one rich display per frame.
display(data_1.head(2), data_5.head(2))

Unnamed: 0,SMILES,ID
0,CC(C)(Sc1ccc(CCN(CCCCC2CCCCC2)C(=O)NC2CCCCC2)c...,CHEMBL21241
1,COc1ccc(CC2SC(=O)NC2=O)cc1C(=O)NCc1ccc(C(F)(F)...,CHEMBL24458


Unnamed: 0,SMILES,ID
0,CCO/N=C(\C)C(Cc1ccc(OCCc2nc(-c3ccccc3)oc2C)cc1...,CHEMBL388422
1,C/C(=N\OCC(C)C)C(Cc1ccc(OCCc2nc(-c3ccccc3)oc2C...,CHEMBL388645


# Task 2: Convert SMILES to molecules and compute Morgan fingerprints

In [14]:
from isdd.extractor import smile_to_mol, get_morgan_fgs

# Add two derived columns to each table, in this order:
#   'Mol'        - smile_to_mol applied to every 'SMILES' entry
#   'Morgan_fgs' - get_morgan_fgs applied to every 'Mol' entry
# Both helpers come from isdd.extractor; their return types are not visible
# in this notebook, so the columns are treated as opaque objects here.
for compounds in (data_1, data_5):
    compounds['Mol'] = compounds['SMILES'].apply(smile_to_mol)
    compounds['Morgan_fgs'] = compounds['Mol'].apply(get_morgan_fgs)



# Task 3: Pairwise similarity matrix between data_1 and data_5

In [15]:
from isdd.similarity_pairwise import get_simi_pairwise

# Build the pairwise similarity table: one row per data_1 compound, one column
# per data_5 compound, each cell holding get_simi_pairwise(fp_1, fp_5).
#
# All scores are collected first and the DataFrame is constructed once. The
# earlier approach wrote cell-by-cell with .at into an empty (object-dtype)
# frame inside nested iterrows() loops, which rebuilds a row Series on every
# step and leaves the scores stored as generic Python objects. Building from a
# nested list lets pandas infer a numeric dtype when the scores are numbers.
fingerprints_5 = list(data_5['Morgan_fgs'])
similarity_rows = [
    [get_simi_pairwise(fp_1, fp_5) for fp_5 in fingerprints_5]
    for fp_1 in data_1['Morgan_fgs']
]

# Passing the ID Series directly keeps 'ID' as the name of both axes, so the
# saved CSV and the displayed header look the same as before.
similarity_matrix = pd.DataFrame(
    similarity_rows,
    index=data_1['ID'],
    columns=data_5['ID'],
)

# Persist the full matrix, then preview the first five rows as the cell output.
similarity_matrix.to_csv('data/similarity_matrix.csv', sep=',')
similarity_matrix.head(5)

ID,CHEMBL388422,CHEMBL388645,CHEMBL389297,CHEMBL389298,CHEMBL389903,CHEMBL390116,CHEMBL391620,CHEMBL391622,CHEMBL392474,CHEMBL392810,...,CHEMBL516510,CHEMBL516824,CHEMBL516964,CHEMBL518348,CHEMBL520272,CHEMBL520508,CHEMBL522366,CHEMBL522754,CHEMBL526740,CHEMBL537921
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEMBL21241,0.123711,0.121212,0.118812,0.111111,0.150538,0.163043,0.134615,0.145833,0.173469,0.153061,...,0.163265,0.175824,0.163265,0.172043,0.114583,0.134831,0.137615,0.118812,0.120879,0.133929
CHEMBL24458,0.135417,0.132653,0.141414,0.122449,0.3375,0.38961,0.156863,0.145833,0.138614,0.141414,...,0.266667,0.289157,0.252747,0.267442,0.103093,0.122222,0.117117,0.118812,0.133333,0.133929
CHEMBL107367,0.575342,0.56,0.586667,0.487179,0.228261,0.189474,0.252525,0.26087,0.222222,0.252632,...,0.165049,0.164948,0.153846,0.161616,0.378049,0.103093,0.181818,0.239583,0.213483,0.242991
CHEMBL278590,0.131313,0.117647,0.16,0.13,0.208791,0.222222,0.186275,0.164948,0.156863,0.171717,...,0.158416,0.170213,0.158416,0.166667,0.122449,0.155556,0.154545,0.115385,0.141304,0.12069
CHEMBL81248,0.550725,0.535211,0.520548,0.459459,0.193182,0.166667,0.26087,0.285714,0.228261,0.275862,...,0.217391,0.22093,0.204301,0.215909,0.4,0.125,0.232323,0.206522,0.149425,0.213592


# Task 4: Percentage of similarity values in each 0.1-wide threshold bin

In [16]:
from isdd.percentage_simi import get_percentage_simi
import numpy as np

# Bin edges 0.0, 0.1, ..., 1.0 are derived from integers so that every edge is
# exactly the one-decimal float it is meant to be. np.arange with a fractional
# step accumulates rounding error (and its length is not guaranteed), which
# previously had to be patched up with round() on every iteration.
threshold_edges = np.arange(11) / 10
threshold_min_list = threshold_edges[:-1]  # lower edges: 0.0 ... 0.9
threshold_max_list = threshold_edges[1:]   # upper edges: 0.1 ... 1.0

matrix = similarity_matrix.values
total_percentage = 0.0
for threshold_min, threshold_max in zip(threshold_min_list, threshold_max_list):
    # get_percentage_simi is defined in isdd.percentage_simi; the printed label
    # assumes it counts entries with min < T <= max -- confirm against the helper.
    percentage = get_percentage_simi(matrix, threshold_min, threshold_max)
    print(f'{threshold_min}<T<={threshold_max}:', percentage, '%')
    total_percentage += percentage

# Sanity check: the bins should add up to about 100 %. The sum is rounded to
# two decimals (the precision of the per-bin values shown in the output) so
# float summation noise such as 99.99999999999999 is not printed.
print('Total', round(total_percentage, 2))

0.0<T<=0.1: 1.27 %
0.1<T<=0.2: 57.82 %
0.2<T<=0.3: 28.02 %
0.3<T<=0.4: 8.15 %
0.4<T<=0.5: 3.21 %
0.5<T<=0.6: 1.06 %
0.6<T<=0.7: 0.3 %
0.7<T<=0.8: 0.1 %
0.8<T<=0.9: 0.05 %
0.9<T<=1.0: 0.02 %
Total 99.99999999999999
