In [10]:
import pandas as pd
import numpy as np
import threading
from drugSimilarityTable import SimilarityTable
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem


In [2]:
path_similarities = 'sanitize_smiles.csv'
drugs_csv = pd.read_csv(path_similarities, sep=";")

In [3]:
numOfThreads = 4

In [4]:
csv_split_size = int(drugs_csv.shape[0] / numOfThreads)

In [5]:
csv_split_size

499743

In [6]:
testCSV = drugs_csv.iloc[:1000]
testCSV.shape

(1000, 41)

In [7]:
testCSV_split = int( testCSV.shape[0] / numOfThreads)
testCSV_split

250

In [52]:
df1 = testCSV.iloc[:250]
df2 = testCSV.iloc[250:500]
df3 = testCSV.iloc[500:750]
df4 = testCSV.iloc[750:]

In [9]:
df1

Unnamed: 0,sanitize-id,smiles,0,1,2,3,4,5,6,7,...,29,30,31,32,33,34,35,36,37,38
0,0,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,0.0,,,,,,,,...,,,,,,,,,,
1,1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,1.0,,,,,,,,...,,,,,,,,,,
2,2,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,2.0,,,,,,,,...,,,,,,,,,,
3,3,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,3.0,,,,,,,,...,,,,,,,,,,
4,4,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,245,Cn1cc(C2=C(c3cn(CCSC(=N)N)c4ccccc34)C(=O)NC2=O...,245.0,,,,,,,,...,,,,,,,,,,
246,246,Cn1cc(C2=C(c3cn(CCCCCSC(=N)N)c4ccccc34)C(=O)NC...,246.0,,,,,,,,...,,,,,,,,,,
247,247,Cc1ccc(Sc2cncc3sc(CO)cc23)cc1,247.0,,,,,,,,...,,,,,,,,,,
248,248,Cc1cc(C)cc(Sc2cncc3sc(C(N)=O)cc23)c1,248.0,,,,,,,,...,,,,,,,,,,


In [27]:
def threadStart(dataFrame):
    for drug in dataFrame.iterrows():
        # drug_id (id de la molecula actual)
        drug_id = drug[0]
        # drug_similarity_table (tabla de similaridad para la molecula actual)
        drug_similarity_table = SimilarityTable(drug_id)
        # drug_smiles (smiles de la molecula actual)
        drug_smiles = drug[1]['smiles']
        drug_molecule = Chem.MolFromSmiles(drug_smiles)
        drug_fingerprint = AllChem.GetMorganFingerprint(drug_molecule, 2)

        for drug_to_test in dataFrame.iterrows():
            # drug_to_test_id (id de la molecula a comparar)
            drug_to_test_id= drug_to_test[0]
            try:
                if drug_to_test_id !=drug_id:
                    # drug_to_test_smiles (smiles de la molecula a comparar)
                    drug_to_test_smiles = drug_to_test[1]['smiles']
                    drug_to_test_molecule = Chem.MolFromSmiles(drug_to_test_smiles)
                    drug_to_test_fingerprint = AllChem.GetMorganFingerprint(drug_to_test_molecule, 2)
                    drugs_similarity = DataStructs.DiceSimilarity(drug_fingerprint, drug_to_test_fingerprint)

                    drug_similarity_table.add_item(drug_to_test_id, drugs_similarity)
            except:
                print("Failed Smiles of Drug", drug_to_test_id)
                continue

        drug_similarity_table.print_dictionary()
        break
    

In [60]:
thread1 = threading.Thread(target = threadStart, args =(df1,))
thread2 = threading.Thread(target = threadStart, args =(df2,))
thread3 = threading.Thread(target = threadStart, args =(df3,))
#thread4 = threading.Thread(target = threadStart, args =(df4,))

thread1.start()
thread2.start()
thread3.start()
#thread4.start()

thread1.join()
thread2.join()
thread3.join()
#thread4.join()

33252  0.8676470588235294
5519 0.8709677419354839
590 0.8522727272727273
555 0.8470588235294118
524 0.8387096774193549
525 0.8387096774193549
714 0.8359788359788359
251 0.8245614035087719
591 0.8061224489795918
721 0.8061224489795918
1 0.7883211678832117
0.8392857142857143
316 0.8275862068965517
251 0.8245614035087719
276 0.8103448275862069
275 0.8070175438596491
277 0.8070175438596491
274 0.8034188034188035
1 0.7883211678832117
313 0.7747747747747747
263 0.7719298245614035
 0.8636363636363636
6 0.8529411764705882
57 0.8382352941176471
4 0.835820895522388
59 0.8333333333333334
251 0.8245614035087719
18 0.8181818181818182
1 0.7883211678832117
7 0.7794117647058824


In [62]:
dataFrames = ['0','0','0','0']
for i in range(numOfThreads):
    dataFrames[i] = np.split(testCSV,numOfThreads)

In [19]:
dataFrames[0]

[     sanitize-id                                             smiles      0  \
 0              0       Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl    0.0   
 1              1    Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1    1.0   
 2              2   Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1    2.0   
 3              3       Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1    3.0   
 4              4     Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1    4.0   
 ..           ...                                                ...    ...   
 495          495  Cc1c(C)c2c(c(C)c1O)CCC(C)(CN(C)CCOc1ccc(/C=C3\...  495.0   
 496          496      COc1cc2nc(N3CCC(C(=O)NCC4CC4)CC3)nc(N)c2cc1OC  496.0   
 497          497      CCCCNC(=O)C1CCN(c2nc(N)c3cc(OC)c(OC)cc3n2)CC1  497.0   
 498          498      CCCCNC(=O)C1CCCN(c2nc(N)c3cc(OC)c(OC)cc3n2)C1  498.0   
 499          499  CN/C(=C\[N+](=O)[O-])NC1c2cc(C#N)ccc2OC(C)(C)[...  499.0   
 
             1         2         3         4   5  