In [1]:
from rdkit import Chem
from rdkit.Chem import PandasTools
import pandas as pd
import numpy as np
import time
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions #Only needed if modifying defaults
from rdkit.Chem import PyMol



In [None]:
data_full1 = pd.read_csv('fulldb1.txt', sep="	", header=0)

In [2]:
patt1 = Chem.MolFromSmarts('C(c)=C(c)')
patt2 = Chem.MolFromSmarts('N(c)=N(c)')

In [9]:
#Divide the whole sets into small batches
samplesize=100000
samplenum=len(data_full1)//samplesize+1

In [14]:
all_DataFrame_A=[]
all_DataFrame_D=[]
version_id_A =[]
version_id_D = []
parent_id_A = []
parent_id_D = []
logfile=open('logfile.txt','w')
#iterate the set to find compounds with certain functional groups
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full1.loc[samplesize*i:]
    else:
        data_segment_1 = data_full1.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['isosmiles'].tolist()
    supp2 = data_segment_1['version_id'].tolist()
    supp3 = data_segment_1['parent_id'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
            
    ms_filter = list(filter(None, ms)) 
    id_filter1 = list(filter(None, supp2)) 
    id_filter2 = list(filter(None, supp3))
    matches_1d = []
    matches_1a = []
    indices1d = []
    indices1a = []
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1d.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices1a.append(i)
    id_final1d = []
    id_final1a = []
    id_final2d = []
    id_final2a = []
    for i in indices1d:
        id_final1d.append(id_filter1[i])
        id_final2d.append(id_filter2[i])
    for i in indices1a:
        id_final2a.append(id_filter2[i])
        id_final1a.append(id_filter1[i])
        
    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    version_id_D = id_final1d + version_id_D
    version_id_A = id_final1a + version_id_A
    
    parent_id_A = id_final2a + parent_id_A
    parent_id_D = id_final2d + parent_id_D
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_1.csv')
Final_D.to_csv('MolPort_D_batch_1.csv')
logfile.close()   


Batch no. 0  Finished, run time:     45.620s
Batch no. 1  Finished, run time:     33.730s
Batch no. 2  Finished, run time:     39.063s
Batch no. 3  Finished, run time:     43.235s
Batch no. 4  Finished, run time:     48.164s
Batch no. 5  Finished, run time:     46.078s
Batch no. 6  Finished, run time:     45.341s
Batch no. 7  Finished, run time:     46.399s
Batch no. 8  Finished, run time:     54.953s
Batch no. 9  Finished, run time:     54.589s
Batch no. 10  Finished, run time:     50.968s
Batch no. 11  Finished, run time:     56.114s
Batch no. 12  Finished, run time:     59.264s
Batch no. 13  Finished, run time:     65.614s
Batch no. 14  Finished, run time:     58.369s
Batch no. 15  Finished, run time:     57.053s
Batch no. 16  Finished, run time:     56.112s
Batch no. 17  Finished, run time:     54.089s
Batch no. 18  Finished, run time:     55.215s
Batch no. 19  Finished, run time:     51.001s
Batch no. 20  Finished, run time:     49.275s
Batch no. 21  Finished, run time:     62.959

Batch no. 177  Finished, run time:     30.481s
Batch no. 178  Finished, run time:     31.195s
Batch no. 179  Finished, run time:     31.309s
Batch no. 180  Finished, run time:     30.733s
Batch no. 181  Finished, run time:     31.586s
Batch no. 182  Finished, run time:     31.200s
Batch no. 183  Finished, run time:     30.449s
Batch no. 184  Finished, run time:     29.179s
Batch no. 185  Finished, run time:     30.723s
Batch no. 186  Finished, run time:     30.339s
Batch no. 187  Finished, run time:     31.366s
Batch no. 188  Finished, run time:     29.674s
Batch no. 189  Finished, run time:     28.971s
Batch no. 190  Finished, run time:     28.963s
Batch no. 191  Finished, run time:     29.783s
Batch no. 192  Finished, run time:     39.760s
Batch no. 193  Finished, run time:     48.365s
Batch no. 194  Finished, run time:     50.007s
Batch no. 195  Finished, run time:     50.010s
Batch no. 196  Finished, run time:     48.569s
Batch no. 197  Finished, run time:     47.463s
Batch no. 198

In [15]:
version_id_A = pd.DataFrame(version_id_A,columns=['version_id'])
version_id_D = pd.DataFrame(version_id_D,columns=['version_id'])
parent_id_A = pd.DataFrame(parent_id_A,columns=['parent_id'])
parent_id_D = pd.DataFrame(parent_id_D,columns=['parent_id'])

all_DataFrame_A=pd.DataFrame(all_DataFrame_A,columns=['isosmiles_A'])
all_DataFrame_D=pd.DataFrame(all_DataFrame_D,columns=['isosmiles_D'])

Final_D = pd.concat([all_DataFrame_D,version_id_A,parent_id_A],axis = 1)
Final_A = pd.concat([all_DataFrame_A,version_id_A,parent_id_A],axis = 1)
Final_A.to_csv('eMol_all_A_batch.csv')
Final_D.to_csv('eMol_all_D_batch.csv')
logfile.close()


In [16]:
Final_D.head()

Unnamed: 0,isosmiles_D,version_id,parent_id
0,CN(C)c1ccc(/C=C/c2nc3ccccc3c(=O)n2-c2ccc(O)cc2...,336464459.0,336464459.0
1,O=c1[nH]c(/C=C/c2ccc(O)cc2)c([N+](=O)[O-])c(=O...,336478551.0,336478551.0
2,Cc1ccccc1/C=C/c1ccccc1,336483667.0,336483667.0
3,O=C(O)C1=Cc2ccccc21,336486210.0,336486210.0
4,N=C(N)c1ccc(/C=C/c2ccc(C(=N)N)cc2)cc1,336488762.0,336488762.0


In [33]:
data_full2 = pd.read_csv('fulldb2.txt', sep="	", header=0)
print(len(data_full2))
samplenum=len(data_full2)//samplesize+1

5000010


In [34]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full2.loc[samplesize*i:]
    else:
        data_segment_1 = data_full2.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])

    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_2.csv')
Final_D.to_csv('MolPort_D_batch_2.csv')
logfile.close()


Batch no. 0  Finished, run time:     44.042s
Batch no. 1  Finished, run time:     49.687s
Batch no. 2  Finished, run time:     55.276s
Batch no. 3  Finished, run time:     61.109s
Batch no. 4  Finished, run time:     53.748s
Batch no. 5  Finished, run time:     50.768s
Batch no. 6  Finished, run time:     57.050s
Batch no. 7  Finished, run time:     61.799s
Batch no. 8  Finished, run time:     75.251s
Batch no. 9  Finished, run time:     72.282s
Batch no. 10  Finished, run time:     64.802s
Batch no. 11  Finished, run time:     56.189s
Batch no. 12  Finished, run time:     56.447s
Batch no. 13  Finished, run time:     56.700s
Batch no. 14  Finished, run time:     61.174s
Batch no. 15  Finished, run time:     60.956s
Batch no. 16  Finished, run time:     61.172s
Batch no. 17  Finished, run time:     58.681s
Batch no. 18  Finished, run time:     61.364s
Batch no. 19  Finished, run time:     65.873s
Batch no. 20  Finished, run time:     66.785s
Batch no. 21  Finished, run time:     87.926

In [35]:
data_full3 = pd.read_csv('fulldb3.txt', sep="	", header=0)
print(len(data_full3))
samplenum=len(data_full3)//samplesize+1

5000009


In [36]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full3.loc[samplesize*i:]
    else:
        data_segment_1 = data_full3.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])


    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_3.csv')
Final_D.to_csv('MolPort_D_batch_3.csv')
logfile.close()


Batch no. 0  Finished, run time:     53.949s
Batch no. 1  Finished, run time:     63.953s
Batch no. 2  Finished, run time:     63.550s
Batch no. 3  Finished, run time:     42.917s
Batch no. 4  Finished, run time:     35.643s
Batch no. 5  Finished, run time:     36.509s
Batch no. 6  Finished, run time:     29.809s
Batch no. 7  Finished, run time:     28.345s
Batch no. 8  Finished, run time:     30.473s
Batch no. 9  Finished, run time:     30.628s
Batch no. 10  Finished, run time:     30.515s
Batch no. 11  Finished, run time:     30.675s
Batch no. 12  Finished, run time:     30.092s
Batch no. 13  Finished, run time:     33.720s
Batch no. 14  Finished, run time:     32.616s
Batch no. 15  Finished, run time:     28.803s
Batch no. 16  Finished, run time:     29.977s
Batch no. 17  Finished, run time:     29.604s
Batch no. 18  Finished, run time:     27.038s
Batch no. 19  Finished, run time:     29.306s
Batch no. 20  Finished, run time:     28.632s
Batch no. 21  Finished, run time:     29.217

In [37]:
data_full4 = pd.read_csv('fulldb4.txt', sep="	", header=0)
print(len(data_full4))
samplesize=100000
samplenum=len(data_full4)//samplesize+1

5000095


In [38]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full4.loc[samplesize*i:]
    else:
        data_segment_1 = data_full4.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])

    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_4.csv')
Final_D.to_csv('MolPort_D_batch_4.csv')
logfile.close()


Batch no. 0  Finished, run time:     39.239s
Batch no. 1  Finished, run time:     44.781s
Batch no. 2  Finished, run time:     51.978s
Batch no. 3  Finished, run time:     53.088s
Batch no. 4  Finished, run time:     56.632s
Batch no. 5  Finished, run time:     53.822s
Batch no. 6  Finished, run time:     58.106s
Batch no. 7  Finished, run time:     54.003s
Batch no. 8  Finished, run time:     50.487s
Batch no. 9  Finished, run time:     48.898s
Batch no. 10  Finished, run time:     55.171s
Batch no. 11  Finished, run time:     53.293s
Batch no. 12  Finished, run time:     65.908s
Batch no. 13  Finished, run time:     68.302s
Batch no. 14  Finished, run time:     67.111s
Batch no. 15  Finished, run time:     68.122s
Batch no. 16  Finished, run time:    113.924s
Batch no. 17  Finished, run time:    144.987s
Batch no. 18  Finished, run time:    153.837s
Batch no. 19  Finished, run time:    151.933s
Batch no. 20  Finished, run time:    147.298s
Batch no. 21  Finished, run time:    141.199

In [5]:
data_full5 = pd.read_csv('fulldb5.txt', sep="	", header=0)
print(len(data_full5))
samplenum=len(data_full5)//samplesize+1

5000009


In [6]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full5.loc[samplesize*i:]
    else:
        data_segment_1 = data_full5.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])

    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_5.csv')
Final_D.to_csv('MolPort_D_batch_5.csv')
logfile.close()


Batch no. 0  Finished, run time:     18.407s
Batch no. 1  Finished, run time:     24.120s
Batch no. 2  Finished, run time:     20.466s
Batch no. 3  Finished, run time:     19.782s
Batch no. 4  Finished, run time:     19.768s
Batch no. 5  Finished, run time:     20.201s
Batch no. 6  Finished, run time:     20.078s
Batch no. 7  Finished, run time:     23.644s
Batch no. 8  Finished, run time:     21.748s
Batch no. 9  Finished, run time:     20.850s
Batch no. 10  Finished, run time:     27.520s
Batch no. 11  Finished, run time:     25.860s
Batch no. 12  Finished, run time:     26.311s
Batch no. 13  Finished, run time:     23.215s
Batch no. 14  Finished, run time:     22.228s
Batch no. 15  Finished, run time:     21.201s
Batch no. 16  Finished, run time:     23.210s
Batch no. 17  Finished, run time:     22.102s
Batch no. 18  Finished, run time:     22.856s
Batch no. 19  Finished, run time:     25.998s
Batch no. 20  Finished, run time:     26.438s
Batch no. 21  Finished, run time:     25.257

In [7]:
data_full6 = pd.read_csv('fulldb6.txt', sep="	", header=0)
print(len(data_full6))
samplenum=len(data_full6)//samplesize+1

5000015


In [8]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full6.loc[samplesize*i:]
    else:
        data_segment_1 = data_full6.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])

    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_6.csv')
Final_D.to_csv('MolPort_D_batch_6.csv')
logfile.close()


Batch no. 0  Finished, run time:     21.541s
Batch no. 1  Finished, run time:     27.504s
Batch no. 2  Finished, run time:     29.427s
Batch no. 3  Finished, run time:     28.736s
Batch no. 4  Finished, run time:     32.935s
Batch no. 5  Finished, run time:     30.012s
Batch no. 6  Finished, run time:     28.002s
Batch no. 7  Finished, run time:     26.532s
Batch no. 8  Finished, run time:     27.433s
Batch no. 9  Finished, run time:     25.631s
Batch no. 10  Finished, run time:     26.229s
Batch no. 11  Finished, run time:     25.238s
Batch no. 12  Finished, run time:     27.724s
Batch no. 13  Finished, run time:     27.439s
Batch no. 14  Finished, run time:     29.406s
Batch no. 15  Finished, run time:     26.420s
Batch no. 16  Finished, run time:     27.972s
Batch no. 17  Finished, run time:     26.885s
Batch no. 18  Finished, run time:     29.449s
Batch no. 19  Finished, run time:     33.105s
Batch no. 20  Finished, run time:     31.245s
Batch no. 21  Finished, run time:     43.841

In [9]:
data_full7 = pd.read_csv('fulldb7.txt', sep="	", header=0)
print(len(data_full7))
samplenum=len(data_full7)//samplesize+1

5000006


In [10]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full7.loc[samplesize*i:]
    else:
        data_segment_1 = data_full7.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])

    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_7.csv')
Final_D.to_csv('MolPort_D_batch_7.csv')
logfile.close()


Batch no. 0  Finished, run time:     20.221s
Batch no. 1  Finished, run time:     23.214s
Batch no. 2  Finished, run time:     25.672s
Batch no. 3  Finished, run time:     24.873s
Batch no. 4  Finished, run time:     27.381s
Batch no. 5  Finished, run time:     25.760s
Batch no. 6  Finished, run time:     27.206s
Batch no. 7  Finished, run time:     25.548s
Batch no. 8  Finished, run time:     28.094s
Batch no. 9  Finished, run time:     31.482s
Batch no. 10  Finished, run time:     32.561s
Batch no. 11  Finished, run time:     28.706s
Batch no. 12  Finished, run time:     30.434s
Batch no. 13  Finished, run time:     28.790s
Batch no. 14  Finished, run time:     29.739s
Batch no. 15  Finished, run time:     27.670s
Batch no. 16  Finished, run time:     29.639s
Batch no. 17  Finished, run time:     28.043s
Batch no. 18  Finished, run time:     29.666s
Batch no. 19  Finished, run time:     30.161s
Batch no. 20  Finished, run time:     30.423s
Batch no. 21  Finished, run time:     30.120

In [11]:
data_full8 = pd.read_csv('fulldb8.txt', sep="	", header=0)
print(len(data_full8))
samplenum=len(data_full8)//samplesize+1

5000003


In [12]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full8.loc[samplesize*i:]
    else:
        data_segment_1 = data_full8.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])

    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_8.csv')
Final_D.to_csv('MolPort_D_batch_8.csv')
logfile.close()


Batch no. 0  Finished, run time:     30.049s
Batch no. 1  Finished, run time:     35.141s
Batch no. 2  Finished, run time:     26.365s
Batch no. 3  Finished, run time:     25.997s
Batch no. 4  Finished, run time:     27.395s
Batch no. 5  Finished, run time:     25.465s
Batch no. 6  Finished, run time:     40.140s
Batch no. 7  Finished, run time:     49.591s
Batch no. 8  Finished, run time:     71.789s
Batch no. 9  Finished, run time:     59.332s
Batch no. 10  Finished, run time:     48.501s
Batch no. 11  Finished, run time:     49.632s
Batch no. 12  Finished, run time:     48.630s
Batch no. 13  Finished, run time:     49.738s
Batch no. 14  Finished, run time:     48.641s
Batch no. 15  Finished, run time:     42.747s
Batch no. 16  Finished, run time:     42.158s
Batch no. 17  Finished, run time:     43.896s
Batch no. 18  Finished, run time:     40.294s
Batch no. 19  Finished, run time:     31.058s
Batch no. 20  Finished, run time:     31.788s
Batch no. 21  Finished, run time:     28.805

In [13]:
data_full9 = pd.read_csv('fulldb9.txt', sep="	", header=0)
print(len(data_full9))
samplesize = 100000
samplenum=len(data_full9)//samplesize+1

4999925


In [14]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full9.loc[samplesize*i:]
    else:
        data_segment_1 = data_full9.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])

    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_9.csv')
Final_D.to_csv('MolPort_D_batch_9.csv')
logfile.close()


Batch no. 0  Finished, run time:     33.408s
Batch no. 1  Finished, run time:     23.515s
Batch no. 2  Finished, run time:     26.652s
Batch no. 3  Finished, run time:     33.341s
Batch no. 4  Finished, run time:     40.455s
Batch no. 5  Finished, run time:     43.837s
Batch no. 6  Finished, run time:     44.395s
Batch no. 7  Finished, run time:     38.332s
Batch no. 8  Finished, run time:     38.122s
Batch no. 9  Finished, run time:     37.864s
Batch no. 10  Finished, run time:     45.618s
Batch no. 11  Finished, run time:     44.543s
Batch no. 12  Finished, run time:     39.399s
Batch no. 13  Finished, run time:     39.952s
Batch no. 14  Finished, run time:     43.523s
Batch no. 15  Finished, run time:     40.567s
Batch no. 16  Finished, run time:     41.717s
Batch no. 17  Finished, run time:     39.793s
Batch no. 18  Finished, run time:     43.432s
Batch no. 19  Finished, run time:     38.062s
Batch no. 20  Finished, run time:     35.820s
Batch no. 21  Finished, run time:     35.748

In [15]:
data_full10 = pd.read_csv('fulldb10.txt', sep="	", header=0)
print(len(data_full10))
samplenum=len(data_full10)//samplesize+1

2196566


In [16]:
all_DataFrame_A=[]
all_DataFrame_D=[]
all_id_D = []
all_id_A = []
logfile=open('logfile.txt','w')
for i in range(samplenum):
    if i==samplenum-1:
        data_segment_1=data_full10.loc[samplesize*i:]
    else:
        data_segment_1 = data_full10.loc[samplesize*i:samplesize*(i+1)]
    start = time.time()
    
    print('Batch no. {:d}'.format(i), end="  ")

    suppl = data_segment_1['SMILES'].tolist()
    supp2 = data_segment_1['MOLPORTID'].tolist()
    ms = []
    for smi in suppl:
        try: 
            ms.append(Chem.MolFromSmiles(smi))
        except:
            ms.append(None)
    ms_filter = list(filter(None, ms)) 
    id_filter = list(filter(None, supp2))
    indices1 = []
    indices2 = []
    matches_1d = []
    matches_1a = []
    
    for i,mol in enumerate(ms_filter):
        if mol.HasSubstructMatch(patt1):
            matches_1d.append(mol)
            indices1.append(i)
        if mol.HasSubstructMatch(patt2):
            matches_1a.append(mol)
            indices2.append(i)
    id_final1 = []
    id_final2 = []
    for i in indices1:
        id_final1.append(id_filter[i])
    for i in indices2:
        id_final2.append(id_filter[i])

    
    logfile.write('Batch number {:>20d}\n'.format(i))
    logfile.write('Number of Diarylethene matches: {:>8d}\n'.format(len(matches_1d)))
    logfile.write('Number of Azobenzene matches: {:>8d}\n'.format(len(matches_1a)))
    
    smiles_D, smiles_A=[],[]
    for molecule in matches_1d:
        smiles=Chem.MolToSmiles(molecule)
        smiles_D.append(smiles)
    for molecule in matches_1a:
        smiles=Chem.MolToSmiles(molecule)
        smiles_A.append(smiles)
    all_DataFrame_D=all_DataFrame_D+smiles_D
    all_DataFrame_A=all_DataFrame_A+smiles_A
    all_id_D = id_final1 + all_id_D
    all_id_A = id_final2 + all_id_A
    end = time.time()
    print('Finished, run time: {:>10.3f}s'.format(end - start))
all_id_D = pd.DataFrame(all_id_D)
all_id_A = pd.DataFrame(all_id_A)
all_DataFrame_A=pd.DataFrame(all_DataFrame_A)
all_DataFrame_D=pd.DataFrame(all_DataFrame_D)

Final_D = pd.concat([all_DataFrame_D,all_id_D],axis = 1)
Final_A = pd.concat([all_DataFrame_A,all_id_A],axis = 1)
Final_A.to_csv('MolPort_A_batch_10.csv')
Final_D.to_csv('MolPort_D_batch_10.csv')
logfile.close()


Batch no. 0  Finished, run time:     57.901s
Batch no. 1  Finished, run time:     56.886s
Batch no. 2  Finished, run time:     46.226s
Batch no. 3  Finished, run time:     48.042s
Batch no. 4  Finished, run time:     34.627s
Batch no. 5  Finished, run time:     27.470s
Batch no. 6  Finished, run time:     35.143s
Batch no. 7  Finished, run time:     44.860s
Batch no. 8  Finished, run time:     51.603s
Batch no. 9  Finished, run time:     55.929s
Batch no. 10  Finished, run time:     55.035s
Batch no. 11  Finished, run time:     53.379s
Batch no. 12  Finished, run time:     65.453s
Batch no. 13  Finished, run time:     71.998s
Batch no. 14  Finished, run time:     66.664s
Batch no. 15  Finished, run time:     57.682s
Batch no. 16  Finished, run time:     49.120s
Batch no. 17  Finished, run time:     54.983s
Batch no. 18  Finished, run time:     55.183s
Batch no. 19  Finished, run time:     58.971s
Batch no. 20  Finished, run time:     53.234s
Batch no. 21  Finished, run time:     49.017

In [3]:
A1 = pd.read_csv("MolPort_A_batch_1.csv",sep=',',header = 0)
A1_smi = A1.iloc[:,1].tolist()
A1_id = A1.iloc[:,2].tolist()
A2 = pd.read_csv("MolPort_A_batch_2.csv",sep=',',header = 0)
A2_smi = A2.iloc[:,1].tolist()
A2_id = A2.iloc[:,2].tolist()
A3 = pd.read_csv("MolPort_A_batch_3.csv",sep=',',header = 0)
A3_smi = A3.iloc[:,1].tolist()
A3_id = A3.iloc[:,2].tolist()
A4 = pd.read_csv("MolPort_A_batch_4.csv",sep=',',header = 0)
A4_smi = A4.iloc[:,1].tolist()
A4_id = A4.iloc[:,2].tolist()
A5 = pd.read_csv("MolPort_A_batch_5.csv",sep=',',header = 0)
A5_smi = A5.iloc[:,1].tolist()
A5_id = A5.iloc[:,2].tolist()
A6 = pd.read_csv("MolPort_A_batch_6.csv",sep=',',header = 0)
A6_smi = A6.iloc[:,1].tolist()
A6_id = A6.iloc[:,2].tolist()
A7 = pd.read_csv("MolPort_A_batch_7.csv",sep=',',header = 0)
A7_smi = A7.iloc[:,1].tolist()
A7_id = A7.iloc[:,2].tolist()
A8 = pd.read_csv("MolPort_A_batch_8.csv",sep=',',header = 0)
A8_smi = A8.iloc[:,1].tolist()
A8_id = A8.iloc[:,2].tolist()
A9 = pd.read_csv("MolPort_A_batch_9.csv",sep=',',header = 0)
A9_smi = A9.iloc[:,1].tolist()
A9_id = A9.iloc[:,2].tolist()
A10 = pd.read_csv("MolPort_A_batch_10.csv",sep=',',header = 0)
A10_smi = A10.iloc[:,1].tolist()
A10_id = A10.iloc[:,2].tolist()

In [4]:
All_A_smi = A1_smi+A2_smi+A3_smi+A4_smi+A5_smi+A6_smi+A7_smi+A8_smi+A9_smi+A10_smi
All_A_id = A1_id+A2_id+A3_id+A4_id+A5_id+A6_id+A7_id+A8_id+A9_id+A10_id
All_A_smi_frame = pd.DataFrame(All_A_smi)
All_A_id_frame = pd.DataFrame(All_A_id)
Final_A = pd.concat([All_A_smi_frame,All_A_id_frame],axis = 1)
Final_A.to_csv('Final_Azobenzenes.csv')#Combine the filter results

In [5]:
A1 = pd.read_csv("MolPort_D_batch_1.csv",sep=',',header = 0)
A1_smi = A1.iloc[:,1].tolist()
A1_id = A1.iloc[:,2].tolist()
A2 = pd.read_csv("MolPort_D_batch_2.csv",sep=',',header = 0)
A2_smi = A2.iloc[:,1].tolist()
A2_id = A2.iloc[:,2].tolist()
A3 = pd.read_csv("MolPort_D_batch_3.csv",sep=',',header = 0)
A3_smi = A3.iloc[:,1].tolist()
A3_id = A3.iloc[:,2].tolist()
A4 = pd.read_csv("MolPort_D_batch_4.csv",sep=',',header = 0)
A4_smi = A4.iloc[:,1].tolist()
A4_id = A4.iloc[:,2].tolist()
A5 = pd.read_csv("MolPort_D_batch_5.csv",sep=',',header = 0)
A5_smi = A5.iloc[:,1].tolist()
A5_id = A5.iloc[:,2].tolist()
A6 = pd.read_csv("MolPort_D_batch_6.csv",sep=',',header = 0)
A6_smi = A6.iloc[:,1].tolist()
A6_id = A6.iloc[:,2].tolist()
A7 = pd.read_csv("MolPort_D_batch_7.csv",sep=',',header = 0)
A7_smi = A7.iloc[:,1].tolist()
A7_id = A7.iloc[:,2].tolist()
A8 = pd.read_csv("MolPort_D_batch_8.csv",sep=',',header = 0)
A8_smi = A8.iloc[:,1].tolist()
A8_id = A8.iloc[:,2].tolist()
A9 = pd.read_csv("MolPort_D_batch_9.csv",sep=',',header = 0)
A9_smi = A9.iloc[:,1].tolist()
A9_id = A9.iloc[:,2].tolist()
A10 = pd.read_csv("MolPort_D_batch_10.csv",sep=',',header = 0)
A10_smi = A10.iloc[:,1].tolist()
A10_id = A10.iloc[:,2].tolist()

In [7]:
All_A_smi = A1_smi+A2_smi+A3_smi+A4_smi+A5_smi+A6_smi+A7_smi+A8_smi+A9_smi+A10_smi
All_A_id = A1_id+A2_id+A3_id+A4_id+A5_id+A6_id+A7_id+A8_id+A9_id+A10_id
All_A_smi_frame = pd.DataFrame(All_A_smi)
All_A_id_frame = pd.DataFrame(All_A_id)
Final_A = pd.concat([All_A_smi_frame,All_A_id_frame],axis = 1)
Final_A.to_csv('Final_Diarylethenes.csv')

In [None]:
emol = pd.read_csv("eMol_All_A_batch.csv",sep=',',header = 0)

In [44]:
#Drop duplicates
import pandas as pd
A_df = pd.read_csv("eMol_D_batch.csv", sep=',', header=0) 
Azobenzenes_list = A_df.iloc[:,1].tolist()
Final_Azobenzenes_list = list(set(Azobenzenes_list))
#Final_Azobenzenes = pd.DataFrame(Final_Azobenzenes_list)
#Final_Azobenzenes.to_csv('Final_Azobenzenes.csv')

In [45]:
len(Azobenzenes_list)

80197

In [46]:
len(Final_Azobenzenes_list)

80188

In [47]:
#Add id
idlist = A_df.iloc[:,2].tolist()
indices_a=[]
ID=[]
for i,item in enumerate(Final_Azobenzenes_list):
    if item in Azobenzenes_list:
        indices_a.append(i)
        
for i in indices_a:
    ID.append(idlist[i])

In [48]:
len(ID)

80188

In [49]:
Azo_MP1 = pd.DataFrame(ID,columns=['eMolecules ID'])
Azo_MP2 = pd.DataFrame(Final_Azobenzenes_list,columns=['Cannonical Smiles'])
Azo_MP = pd.concat([Azo_MP1, Azo_MP2], axis=1)
Azo_MP.to_csv('DIAR_EM.csv')

In [31]:
D_df = pd.read_csv("All_D_batch.csv", sep=',', header=0) 
Diarylethenes_list = D_df.iloc[:,2].tolist()
Final_Diarylethenes_list = list(set(Diarylethenes_list))
Final_Diarylethenes = pd.DataFrame(Final_Diarylethenes_list)
Final_Diarylethenes.to_csv('Final_Diarylethenes.csv')

In [2]:
allsmilesmolport = pd.read_csv("All_A_batch.csv", sep=',', header=0)
allsmilesemolecules = pd.read_csv("All_D_batch.csv", sep=',', header=0)

In [None]:
data1=pd.read_csv('fulldb1.txt', sep="	", header=0)
data2=pd.read_csv('fulldb2.txt', sep="	", header=0)
data3=pd.read_csv('fulldb3.txt', sep="	", header=0)
data4=pd.read_csv('fulldb4.txt', sep="	", header=0)
data5=pd.read_csv('fulldb5.txt', sep="	", header=0)
data6=pd.read_csv('fulldb6.txt', sep="	", header=0)
data7=pd.read_csv('fulldb7.txt', sep="	", header=0)
data8=pd.read_csv('fulldb8.txt', sep="	", header=0)
data9=pd.read_csv('fulldb9.txt', sep="	", header=0)
data10=pd.read_csv('fulldb10.txt', sep="	", header=0)

data_full_eMolecules = pd.read_csv('version.smi.txt', sep=" ", header=0)

In [None]:
allsmilesmolport_list = allsmilesmolport.iloc[:,1].tolist()
ID1=data1['MOLPORTID'].tolist()
ID2=data2['MOLPORTID'].tolist()
ID3=data3['MOLPORTID'].tolist()
ID4=data4['MOLPORTID'].tolist()
ID5=data5['MOLPORTID'].tolist()
ID6=data6['MOLPORTID'].tolist()
ID7=data7['MOLPORTID'].tolist()
ID8=data8['MOLPORTID'].tolist()
ID9=data9['MOLPORTID'].tolist()
ID10=data10['MOLPORTID'].tolist()
allIDmolport = ID1+ID2+ID3+ID4+ID5+ID6+ID7+ID8+ID9+ID10

allsmilesemolecules_list = allsmilesemolecules.iloc[:,1].tolist()
allIDemolecules = data_full_eMolecules['parent_id'].tolist()