# Molecule processing
**In this notebook, the processing of the molecules will be carried out in order to obtain a
similarity table of all the molecules belonging to the chosen data set.**


## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import sys
import threading
import os

from io import StringIO
from drugSimilarityTable import SimilarityTable
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

## Load the CSV File
Read the complete file that will be used to perform the similarity comparisons

In [2]:
# CSV file holding the sanitized molecules that will be compared with each other.
path_similarities = 'sanitize_smiles.csv'

# dtype to enforce for each column that is read from the CSV file.
data_types_dictionary = {'sanitize-id': 'int', 'smiles': 'str'}

# Only the columns that have a declared dtype are loaded (same order as the mapping above).
columns_to_process = list(data_types_dictionary)

In [3]:
# Load the ';'-separated file, keeping only the configured columns with their declared dtypes.
drugs_csv = pd.read_csv(
    path_similarities,
    sep=';',
    usecols=columns_to_process,
    dtype=data_types_dictionary,
)
drugs_csv

Unnamed: 0,sanitize-id,smiles
0,0,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,2,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,3,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,4,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1
...,...,...
19995,19995,CC(Cc1cccc2ccccc12)NCCCc1ccccc1
19996,19996,CC(C)C[C@H](NC(=O)[C@@H](S)Cc1ccccc1)C(=O)N[C@...
19997,19997,Nc1nc(Cl)c2ncn(C3CC(CO)C(O)C3O)c2n1
19998,19998,COCCOCO[C@H](NC(=O)C(F)(F)C(=O)C(Cc1ccc(OCc2cc...


### Handling errors from RDKit
Wrap the internal RDKit log streams so that they are written to Python's `sys.stderr`.
Then replace `sys.stderr` with an in-memory buffer so that the messages emitted by the RDKit operations can be captured.

In [4]:
# Route RDKit's internal log streams through Python's sys.stderr.
Chem.WrapLogs()

# Swap sys.stderr for an in-memory buffer so RDKit messages can be read back with sio.getvalue().
# Note that everything else written to sys.stderr from here on ends up in this buffer too.
sio = StringIO()
sys.stderr = sio

# Collects information about the molecules that RDKit could not process.
failures = []

## Data analysis

All columns are checked for null values: a null SMILES value for any compound would cause errors during processing.

In [5]:
# Count the missing values of each loaded column.
drugs_csv.isna().sum()

sanitize-id    0
smiles         0
dtype: int64

The data types and the names of the columns are also verified.


In [6]:
# Print the DataFrame summary: index range, column names, non-null counts and dtypes.
drugs_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sanitize-id  20000 non-null  int64 
 1   smiles       20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


## Calculation of the similarity table
The similarity table is built by comparing each molecule with all the other molecules
in the database, with the aim of obtaining, for every molecule, the ranking of the molecules
that are most similar to it.

### Chunk division
Split the main file into chunks to facilitate further processing through threads.

In [7]:
# Number of worker threads used for the parallel processing.
numThreads = 4

# Number of rows written to each chunk file.
chunk_size = 1000

# Directory that holds the chunk files, and the file-name template used inside it
# ('{}' is replaced by the chunk index).
chunks_directory = 'sanitize_chunks'
path_creation_chunks = chunks_directory + '/chunk{}.csv'

In [8]:
# Make sure the output directory exists before writing: DataFrame.to_csv does not create
# missing parent directories, so on a fresh checkout the first write would fail otherwise.
os.makedirs(os.path.dirname(path_creation_chunks) or '.', exist_ok=True)

# Stream the source file in blocks of `chunk_size` rows and write each block to its own
# ';'-separated file, numbered in reading order.
chunk_reader = pd.read_csv(path_similarities, usecols=columns_to_process, dtype=data_types_dictionary,
                           sep=';', chunksize=chunk_size)
for i, chunk in enumerate(chunk_reader):
    chunk.to_csv(path_creation_chunks.format(i), index=False, sep=';')

### Parallel processing
Define the function to be executed by each thread within the parallel processing of the database chunks

The processing of each molecule consists of the following steps:
1. First, the molecule is built from its "SMILES" string stored in the database.
2. Then the "fingerprint" of the molecule is calculated.
3. Finally, the similarity between the molecules is calculated from their fingerprints.

In [9]:
def _buildReferenceFingerprints(radius):
    """Return ``(sanitize-id, Morgan fingerprint)`` pairs for the rows of ``drugs_csv``.

    Rows whose SMILES cannot be parsed by RDKit are left out, so they never take part
    in a comparison.
    """
    reference_fingerprints = []
    for reference_drug in drugs_csv.to_dict('records'):
        reference_molecule = Chem.MolFromSmiles(reference_drug['smiles'])
        if reference_molecule is None:
            continue
        reference_fingerprints.append(
            (reference_drug['sanitize-id'], AllChem.GetMorganFingerprint(reference_molecule, radius)))
    return reference_fingerprints


def processChunkFiles(files, radius):
    """Compute and save the similarity table of every molecule found in the given chunk files.

    Each molecule of each chunk file is compared (Tanimoto similarity of Morgan fingerprints)
    against every other parsable molecule of the full data set held in ``drugs_csv``.

    Parameters
    ----------
    files : iterable of str
        Names of the chunk files, relative to ``chunks_directory``.
    radius : int
        Radius passed to the Morgan fingerprint calculation and to
        ``SimilarityTable.save_similarity_table``.

    Notes
    -----
    Molecules of a chunk whose SMILES cannot be parsed are recorded in the module-level
    ``failures`` list as ``(sanitize-id, captured stderr text)`` and no table is saved for them.
    """
    # `sio` is rebound below; without this declaration the assignment would make it a local
    # name and reading it before the assignment would raise UnboundLocalError.
    global sio

    # The reference molecules do not depend on the molecule being processed, so they are
    # parsed and fingerprinted once instead of once per chunk molecule.
    reference_fingerprints = _buildReferenceFingerprints(radius)

    for file in files:
        chunk_sanitize_csv = pd.read_csv(chunks_directory + '/' + file, usecols=columns_to_process,
                                         dtype=data_types_dictionary, sep=';')

        for drug in chunk_sanitize_csv.to_dict('records'):
            drug_id = drug['sanitize-id']
            drug_smiles = drug['smiles']

            drug_similarity_table = SimilarityTable(drug_id)
            drug_molecule = Chem.MolFromSmiles(drug_smiles)

            if drug_molecule is None:
                # list.append takes a single argument, so the pair is stored as one tuple.
                failures.append((drug_id, sio.getvalue()))
                # Reset the capture buffer; sys.stderr must be rebound as well, otherwise
                # RDKit keeps writing to the old buffer.
                # NOTE(review): the buffer is shared by all worker threads, so the captured
                # text may contain messages produced by other threads.
                sio = sys.stderr = StringIO()
                continue

            drug_fingerprint = AllChem.GetMorganFingerprint(drug_molecule, radius)

            for drug_to_test_id, drug_to_test_fingerprint in reference_fingerprints:
                # A molecule is not compared with itself.
                if drug_to_test_id == drug_id:
                    continue

                drugs_similarity = DataStructs.TanimotoSimilarity(drug_fingerprint,
                                                                  drug_to_test_fingerprint)
                drug_similarity_table.add_item(drug_to_test_id, drugs_similarity)

            drug_similarity_table.save_similarity_table(radius)


Load the names of the chunk files to process

In [10]:
# List every entry of the chunk directory; these names are what gets split across the threads.
# NOTE(review): os.listdir returns the entries in arbitrary order and does not filter by name,
# so any other file present in the directory is included as well — confirm the directory
# only contains chunk files.
chunk_files = os.listdir(chunks_directory)
chunk_files

['chunk0.csv',
 'chunk1.csv',
 'chunk2.csv',
 'chunk3.csv',
 'chunk4.csv',
 'chunk5.csv',
 'chunk6.csv',
 'chunk7.csv',
 'chunk8.csv',
 'chunk9.csv',
 'chunk10.csv',
 'chunk11.csv',
 'chunk12.csv',
 'chunk13.csv',
 'chunk14.csv',
 'chunk15.csv',
 'chunk16.csv',
 'chunk17.csv',
 'chunk18.csv',
 'chunk19.csv']

Split the list of files according to the number of threads to be handled in parallel processing

In [11]:
# Partition the chunk file names into `numThreads` groups of (nearly) equal size,
# one group per worker thread.
thread_files = np.array_split(chunk_files, indices_or_sections=numThreads)
thread_files

[array(['chunk0.csv', 'chunk1.csv', 'chunk2.csv', 'chunk3.csv',
        'chunk4.csv'], dtype='<U11'),
 array(['chunk5.csv', 'chunk6.csv', 'chunk7.csv', 'chunk8.csv',
        'chunk9.csv'], dtype='<U11'),
 array(['chunk10.csv', 'chunk11.csv', 'chunk12.csv', 'chunk13.csv',
        'chunk14.csv'], dtype='<U11'),
 array(['chunk15.csv', 'chunk16.csv', 'chunk17.csv', 'chunk18.csv',
        'chunk19.csv'], dtype='<U11')]

### Thread definition and initialization

In [12]:
# Build (without starting) one worker per group of chunk files.
# The second positional argument is the fingerprint radius handed to processChunkFiles.
threads = [
    threading.Thread(target=processChunkFiles, args=(thread_files[thread_index], 2))
    for thread_index in range(numThreads)
]

In [13]:
# Launch every worker thread.
for worker in threads[:numThreads]:
    worker.start()

In [None]:
# Block until every worker thread has finished its share of the chunk files.
for worker in threads[:numThreads]:
    worker.join()

Traceback (most recent call last):
  File "/home/jp/anaconda3/envs/my-rdkit-env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_201659/1607152748.py", line 2, in <module>
    threads[i].join()
  File "/home/jp/anaconda3/envs/my-rdkit-env/lib/python3.9/threading.py", line 1053, in join
    self._wait_for_tstate_lock()
  File "/home/jp/anaconda3/envs/my-rdkit-env/lib/python3.9/threading.py", line 1073, in _wait_for_tstate_lock
    if lock.acquire(block, timeout):
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/jp/anaconda3/envs/my-rdkit-env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2064, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, a