# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [1]:
import os
import pandas as pd
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import PandasTools
import numpy as np
from joblib import Parallel, delayed
from rdkit.DataStructs import TanimotoSimilarity
import subprocess
from typing import List, Tuple
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from rdkit import Chem
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

### Define directories
***

In [2]:
# Set user directory
# Home directory of the current user ('~' expanded by os.path.expanduser).
USER_DIR = os.path.expanduser('~')

# Set the project root directory
# Resolved as the parent of the current working directory, so the result
# depends on where the notebook is started from.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Define the input and output directories
# Both live under <PROJECT_ROOT>/data; results of this notebook go to the
# 'bit_flipping_nn' subdirectory.
INPUT_DIR = os.path.join(PROJECT_ROOT, 'data/')
OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'data/bit_flipping_nn')

# Ensure output directory exists
# exist_ok=True keeps this cell re-runnable when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)


### COX2 and Janus Data
***

In [3]:
# Read COX2_SMILES.csv
# The path is built from INPUT_DIR so the data location is configured in a
# single place instead of being repeated as a literal '../data/' prefix.
# The files are semicolon-separated.
cox2_df = pd.read_csv(os.path.join(INPUT_DIR, 'COX2_SMILES.csv'), delimiter=';')
print(f"COX2 shape: {cox2_df.shape}")

# Read Janus_SMILES.csv
janus_df = pd.read_csv(os.path.join(INPUT_DIR, 'Janus_SMILES.csv'), delimiter=';')
print(f"Janus shape: {janus_df.shape}")

COX2 shape: (21, 2)
Janus shape: (12, 2)


In [4]:
# Add molecule column to the dataframes
# Each call reads the 'SMILES' column and adds a 'Molecule' column to the
# frame it is given (no return value is used; the frames are modified in place).
PandasTools.AddMoleculeColumnToFrame(cox2_df, smilesCol='SMILES', molCol='Molecule')
PandasTools.AddMoleculeColumnToFrame(janus_df, smilesCol='SMILES', molCol='Molecule')

In [5]:
# Preview the first rows of the COX2 frame, now including the 'Molecule' column.
cox2_df.head()

Unnamed: 0,title,SMILES,Molecule
0,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad4d0>
1,Anitrazafen,COc1ccc(cc1)c2nnc(C)nc2c3ccc(OC)cc3,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad5b0>
2,Celecoxib,Cc1ccc(cc1)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad620>
3,Cimicoxib,COc1ccc(cc1F)c2c(Cl)ncn2c3ccc(cc3)S(=O)(=O)N,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad690>
4,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad700>


In [6]:
# Preview the first rows of the Janus frame, now including the 'Molecule' column.
janus_df.head()

Unnamed: 0,title,SMILES,Molecule
0,Tofacitinib,C[C@@H]1CCN(C[C@@H]1N(C)c2ncnc3[nH]ccc23)C(=O)CC#N,<rdkit.Chem.rdchem.Mol object at 0x7fbb264adf50>
1,Ruxolitinib,N#CC[C@H](C1CCCC1)n2cc(cn2)c3ncnc4[nH]ccc34,<rdkit.Chem.rdchem.Mol object at 0x7fbb264adfc0>
2,Oclacitinib,CNS(=O)(=O)C[C@@H]1CC[C@H](CC1)N(C)c2ncnc3[nH]ccc23,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ae030>
3,Baricitinib,CCS(=O)(=O)N1CC(CC#N)(C1)n2cc(cn2)c3ncnc4[nH]ccc34,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ae0a0>
4,Upadacitinib,CC[C@@H]1CN(C[C@@H]1c2cnc3cnc4[nH]ccc4n23)C(=O)NCC(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ae110>


In [7]:
def generate_fingerprint(mol):
    """
    Compute the 2048-bit Morgan (radius 2, i.e. ECFP4) fingerprint of a molecule.

    Returns a ``(bit_vector, sparse_str)`` tuple: ``bit_vector`` is a NumPy
    integer array holding the fingerprint bits and ``sparse_str`` is a
    space-separated string of the indices whose bit equals 1.
    ``(None, None)`` is returned when ``mol`` is None.
    """
    # Guard clause: there is nothing to fingerprint.
    if mol is None:
        return None, None

    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    fingerprint = generator.GetFingerprint(mol)

    # Split the bit string into single characters, then cast them to integers.
    bit_chars = list(fingerprint.ToBitString())
    bit_vector = np.array(bit_chars).astype(int)

    # Indices of the bits that are switched on, rendered as "i j k ...".
    on_indices = np.where(bit_vector == 1)[0]
    sparse_str = ' '.join(str(index) for index in on_indices)
    return bit_vector, sparse_str

In [8]:
# Apply the fingerprint generation function to COX2 DataFrame
# Each job returns a (bit_vector, sparse_string) tuple; zip(*...) transposes the
# list of tuples into one tuple of bit vectors and one tuple of sparse strings.
# The column assignment below relies on joblib returning results in input order.
cox2_results = Parallel(n_jobs=-1)(delayed(generate_fingerprint)(mol) for mol in cox2_df['Molecule'])
cox2_bit_vectors, cox2_sparse_fingerprints = zip(*cox2_results)

# Assign the new columns to the COX2 DataFrame
# 'FingerprintBits' holds the dense vector, 'SparseFingerprintBits' the
# space-separated on-bit indices.
cox2_df['FingerprintBits'] = list(cox2_bit_vectors)
cox2_df['SparseFingerprintBits'] = list(cox2_sparse_fingerprints)

# Apply the fingerprint generation function to Janus DataFrame
# Same procedure as for the COX2 frame.
janus_results = Parallel(n_jobs=-1)(delayed(generate_fingerprint)(mol) for mol in janus_df['Molecule'])
janus_bit_vectors, janus_sparse_fingerprints = zip(*janus_results)

# Assign the new columns to the Janus DataFrame
janus_df['FingerprintBits'] = list(janus_bit_vectors)
janus_df['SparseFingerprintBits'] = list(janus_sparse_fingerprints)


In [9]:
# Preview the COX2 frame with the two fingerprint columns added.
cox2_df.head()

Unnamed: 0,title,SMILES,Molecule,FingerprintBits,SparseFingerprintBits
0,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad4d0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917
1,Anitrazafen,COc1ccc(cc1)c2nnc(C)nc2c3ccc(OC)cc3,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad5b0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",123 139 269 322 323 357 378 454 471 695 718 781 831 841 896 978 1057 1126 1160 1164 1380 1536 1722 1750 1855 1873
2,Celecoxib,Cc1ccc(cc1)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad620>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",114 233 235 319 350 368 378 650 715 753 809 833 875 896 935 1057 1160 1171 1197 1328 1380 1399 1434 1440 1446 1453 1476 1489 1527 1607 1692 1722 1750 1823 1825 1852 1873 1920 1928 1956
3,Cimicoxib,COc1ccc(cc1F)c2c(Cl)ncn2c3ccc(cc3)S(=O)(=O)N,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad690>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",27 94 118 233 249 289 319 339 350 378 561 564 650 695 699 704 715 724 746 833 841 875 879 896 932 935 1057 1160 1171 1197 1260 1380 1445 1452 1476 1489 1536 1607 1649 1683 1750 1825 1840 1873 1928 1945 1970
4,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ad700>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",1 94 118 233 249 259 305 319 350 378 501 650 695 699 715 833 841 875 896 935 991 1050 1057 1160 1171 1197 1328 1380 1399 1405 1440 1476 1489 1536 1607 1649 1692 1750 1780 1825 1840 1846 1852 1873 1928 1945 1956 1970


In [10]:
# Preview the Janus frame with the two fingerprint columns added.
janus_df.head()

Unnamed: 0,title,SMILES,Molecule,FingerprintBits,SparseFingerprintBits
0,Tofacitinib,C[C@@H]1CCN(C[C@@H]1N(C)c2ncnc3[nH]ccc23)C(=O)CC#N,<rdkit.Chem.rdchem.Mol object at 0x7fbb264adf50>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",51 72 80 87 259 361 369 378 417 479 489 506 560 590 650 652 674 678 711 739 788 790 806 807 881 926 935 1009 1019 1026 1057 1089 1114 1152 1163 1171 1309 1325 1357 1380 1384 1452 1480 1506 1750 1764 1807 1810 1853 1859 1860 1873 1917 1973 2009
1,Ruxolitinib,N#CC[C@H](C1CCCC1)n2cc(cn2)c3ncnc4[nH]ccc34,<rdkit.Chem.rdchem.Mol object at 0x7fbb264adfc0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",1 43 80 87 276 323 335 339 360 378 464 479 489 674 711 741 790 806 808 890 926 935 944 1019 1026 1028 1051 1089 1114 1152 1160 1161 1171 1256 1325 1348 1357 1380 1384 1452 1506 1535 1634 1706 1739 1750 1810 1873 1876 1897 1978
2,Oclacitinib,CNS(=O)(=O)C[C@@H]1CC[C@H](CC1)N(C)c2ncnc3[nH]ccc23,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ae030>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",29 51 72 80 87 169 259 312 350 369 378 479 489 565 650 711 788 806 809 881 904 926 1019 1025 1026 1057 1114 1152 1154 1163 1171 1325 1357 1380 1430 1452 1454 1476 1506 1524 1561 1671 1750 1810 1873 2018
3,Baricitinib,CCS(=O)(=O)N1CC(CC#N)(C1)n2cc(cn2)c3ncnc4[nH]ccc34,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ae0a0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",43 80 87 141 225 226 276 294 323 350 378 464 479 489 585 602 650 674 675 711 723 741 790 806 872 923 926 935 1026 1057 1060 1089 1114 1152 1160 1171 1256 1357 1380 1384 1452 1476 1502 1506 1535 1700 1750 1810 1817 1873 1876 1897 1915 1971
4,Upadacitinib,CC[C@@H]1CN(C[C@@H]1c2cnc3cnc4[nH]ccc4n23)C(=O)NCC(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fbb264ae110>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",45 48 80 114 187 197 213 218 226 294 311 378 455 479 503 510 546 548 607 638 650 711 739 806 807 817 819 887 926 935 1009 1019 1039 1057 1092 1104 1114 1123 1152 1225 1228 1231 1380 1452 1453 1454 1506 1535 1549 1750 1799 1826 1873 1917 1928 2009


### Extract Parecoxib and Its Nearby Neighbors
***

In [11]:
# Select the rows of five COX2 compounds by their 'title'.
# Each result is a DataFrame (boolean-mask selection), not a Series; it has a
# single row provided the titles are unique in cox2_df.
parecoxib = cox2_df[cox2_df['title'] == 'Parecoxib']
anitrazafen = cox2_df[cox2_df['title'] == 'Anitrazafen']
celecoxib = cox2_df[cox2_df['title'] == 'Celecoxib']
cimicoxib = cox2_df[cox2_df['title'] == 'Cimicoxib']
deracoxib = cox2_df[cox2_df['title'] == 'Deracoxib']

### Flip Bits
***

In [12]:
def flip_single_bit(vector: List[int], position: int) -> List[int]:
    """Return a copy of ``vector`` in which the bit at ``position`` is inverted.

    The vector passed in is not modified; the inversion is computed as
    ``1 - bit`` on a copy obtained through ``vector.copy()``.
    """
    toggled = vector.copy()
    current_bit = toggled[position]
    toggled[position] = 1 - current_bit
    return toggled

def sequential_bit_flipping(fingerprint: List[int], start: int = None, end: int = None) -> List[Tuple[int, List[int]]]:
    """Produce one single-bit-flipped copy of the fingerprint per index in ``[start, end)``.

    Indices are visited from ``end - 1`` down to ``start`` (right to left).
    ``start`` defaults to 0 and ``end`` to ``len(fingerprint)``.

    Returns a list of ``(index, flipped_fingerprint)`` tuples in visiting order.
    """
    lower = 0 if start is None else start
    upper = len(fingerprint) if end is None else end

    # reversed(range(lower, upper)) walks upper-1, upper-2, ..., lower.
    return [
        (index, flip_single_bit(fingerprint, index))
        for index in reversed(range(lower, upper))
    ]

def block_based_flipping(fingerprint: List[int], block_size: int, block_number: int = None) -> List[Tuple[int, List[int]]]:
    """Perform single bit flipping block by block, from the end of the fingerprint to its start.

    The fingerprint is split into blocks of ``block_size`` indices counted from
    the right: block 0 covers the last ``block_size`` indices, block 1 the ones
    before it, and so on. When ``block_size`` does not divide the fingerprint
    length, the final (highest-numbered) block is a shorter one covering the
    remaining low indices, so no bit is left out.

    Args:
        fingerprint: Dense 0/1 fingerprint.
        block_size: Number of indices per block; must be positive.
        block_number: When given, only this block is processed; otherwise all
            blocks are processed in order 0, 1, 2, ...

    Returns:
        A list of ``(index, flipped_fingerprint)`` tuples as produced by
        ``sequential_bit_flipping`` for each processed block.

    Raises:
        ValueError: If ``block_size`` is not positive or ``block_number`` is
            outside ``0 .. total_blocks - 1``.
    """
    if block_size <= 0:
        raise ValueError("Block size must be a positive integer")

    length = len(fingerprint)
    # Ceiling division so that a trailing partial block is counted as a block.
    total_blocks = -(-length // block_size)

    if block_number is not None:
        if block_number < 0 or block_number >= total_blocks:
            raise ValueError(f"Block number must be between 0 and {total_blocks - 1}")
        end = length - block_number * block_size
        # Clamp at 0: only the partial block can reach below the first index.
        start = max(end - block_size, 0)
        return sequential_bit_flipping(fingerprint, start, end)

    results = []
    # Walk the block end positions from the right edge towards index 0.
    for end in range(length, 0, -block_size):
        start = max(end - block_size, 0)
        results.extend(sequential_bit_flipping(fingerprint, start, end))
    return results

def sparse_to_dense(sparse_fingerprint: List[int], size: int = 2048) -> List[int]:
    """Expand a list of on-bit indices into a 0/1 list of length ``size``.

    Every index listed in ``sparse_fingerprint`` is set to 1; all other
    positions stay 0.
    """
    dense_bits = [0 for _ in range(size)]
    for on_index in sparse_fingerprint:
        dense_bits[on_index] = 1
    return dense_bits

def dense_to_sparse(dense_fingerprint: List[int]) -> List[int]:
    """Collect, in ascending order, the indices whose value equals 1."""
    set_positions = []
    for position, value in enumerate(dense_fingerprint):
        if value == 1:
            set_positions.append(position)
    return set_positions

def generate_flipped_fingerprints(row: pd.Series, block_size: int = None, block_number: int = None) -> List[dict]:
    """Build one record per single-bit-flipped variant of a row's fingerprint.

    The dense fingerprint is read from ``row['FingerprintBits']``. With a
    truthy ``block_size`` the variants come from ``block_based_flipping``
    (optionally restricted to ``block_number``); otherwise every position is
    flipped via ``sequential_bit_flipping``.

    Each returned dict carries the row's title, SMILES and original
    fingerprints together with the flipped dense vector, its sparse index
    list and the flipped position.
    """
    original_dense = row['FingerprintBits']
    original_sparse = row['SparseFingerprintBits']

    if not block_size:
        variants = sequential_bit_flipping(original_dense)
    else:
        variants = block_based_flipping(original_dense, block_size, block_number)

    return [
        {
            'title': row['title'],
            'SMILES': row['SMILES'],
            'FingerprintBits': original_dense,
            'SparseFingerprintBits': original_sparse,
            'FingerprintBitsFlipped': variant,
            # Stored as a list of ints, unlike the space-separated string
            # kept in 'SparseFingerprintBits'.
            'SparseFingerprintBitsFlipped': dense_to_sparse(variant),
            'FlippedBitPosition': bit_index,
        }
        for bit_index, variant in variants
    ]

def process_dataframe(df: pd.DataFrame, block_size: int = None, block_number: int = None) -> pd.DataFrame:
    """Expand every row of ``df`` into its single-bit-flipped fingerprint records.

    Rows are handled in ``df.iterrows()`` order; the records returned by
    ``generate_flipped_fingerprints`` for each row are concatenated and turned
    into a new DataFrame.

    NOTE: a later cell of this notebook defines another ``process_dataframe``
    (the MolForge runner) that replaces this one once that cell has run.
    """
    expanded_records = [
        record
        for _, source_row in df.iterrows()
        for record in generate_flipped_fingerprints(source_row, block_size, block_number)
    ]
    return pd.DataFrame(expanded_records)


In [13]:
# Generate the single-bit-flipped fingerprints for each selected compound.
# No block_size is passed, so every bit position of the fingerprint is flipped
# once. This must run while process_dataframe still refers to the bit-flipping
# version; a later cell redefines that name for the MolForge runs.
parecoxib_flipped_df = process_dataframe(parecoxib)
anitrazafen_flipped_df = process_dataframe(anitrazafen)
celecoxib_flipped_df = process_dataframe(celecoxib)
cimicoxib_flipped_df = process_dataframe(cimicoxib)
deracoxib_flipped_df = process_dataframe(deracoxib)


In [14]:
# Preview the flipped records for Deracoxib (positions are listed from the highest index down).
deracoxib_flipped_df.head()

Unnamed: 0,title,SMILES,FingerprintBits,SparseFingerprintBits,FingerprintBitsFlipped,SparseFingerprintBitsFlipped,FlippedBitPosition
0,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",1 94 118 233 249 259 305 319 350 378 501 650 695 699 715 833 841 875 896 935 991 1050 1057 1160 1171 1197 1328 1380 1399 1405 1440 1476 1489 1536 1607 1649 1692 1750 1780 1825 1840 1846 1852 1873 1928 1945 1956 1970,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]","[1, 94, 118, 233, 249, 259, 305, 319, 350, 378, 501, 650, 695, 699, 715, 833, 841, 875, 896, 935, 991, 1050, 1057, 1160, 1171, 1197, 1328, 1380, 1399, 1405, 1440, 1476, 1489, 1536, 1607, 1649, 1692, 1750, 1780, 1825, 1840, 1846, 1852, 1873, 1928, 1945, 1956, 1970, 2047]",2047
1,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",1 94 118 233 249 259 305 319 350 378 501 650 695 699 715 833 841 875 896 935 991 1050 1057 1160 1171 1197 1328 1380 1399 1405 1440 1476 1489 1536 1607 1649 1692 1750 1780 1825 1840 1846 1852 1873 1928 1945 1956 1970,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]","[1, 94, 118, 233, 249, 259, 305, 319, 350, 378, 501, 650, 695, 699, 715, 833, 841, 875, 896, 935, 991, 1050, 1057, 1160, 1171, 1197, 1328, 1380, 1399, 1405, 1440, 1476, 1489, 1536, 1607, 1649, 1692, 1750, 1780, 1825, 1840, 1846, 1852, 1873, 1928, 1945, 1956, 1970, 2046]",2046
2,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",1 94 118 233 249 259 305 319 350 378 501 650 695 699 715 833 841 875 896 935 991 1050 1057 1160 1171 1197 1328 1380 1399 1405 1440 1476 1489 1536 1607 1649 1692 1750 1780 1825 1840 1846 1852 1873 1928 1945 1956 1970,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]","[1, 94, 118, 233, 249, 259, 305, 319, 350, 378, 501, 650, 695, 699, 715, 833, 841, 875, 896, 935, 991, 1050, 1057, 1160, 1171, 1197, 1328, 1380, 1399, 1405, 1440, 1476, 1489, 1536, 1607, 1649, 1692, 1750, 1780, 1825, 1840, 1846, 1852, 1873, 1928, 1945, 1956, 1970, 2045]",2045
3,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",1 94 118 233 249 259 305 319 350 378 501 650 695 699 715 833 841 875 896 935 991 1050 1057 1160 1171 1197 1328 1380 1399 1405 1440 1476 1489 1536 1607 1649 1692 1750 1780 1825 1840 1846 1852 1873 1928 1945 1956 1970,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]","[1, 94, 118, 233, 249, 259, 305, 319, 350, 378, 501, 650, 695, 699, 715, 833, 841, 875, 896, 935, 991, 1050, 1057, 1160, 1171, 1197, 1328, 1380, 1399, 1405, 1440, 1476, 1489, 1536, 1607, 1649, 1692, 1750, 1780, 1825, 1840, 1846, 1852, 1873, 1928, 1945, 1956, 1970, 2044]",2044
4,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",1 94 118 233 249 259 305 319 350 378 501 650 695 699 715 833 841 875 896 935 991 1050 1057 1160 1171 1197 1328 1380 1399 1405 1440 1476 1489 1536 1607 1649 1692 1750 1780 1825 1840 1846 1852 1873 1928 1945 1956 1970,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]","[1, 94, 118, 233, 249, 259, 305, 319, 350, 378, 501, 650, 695, 699, 715, 833, 841, 875, 896, 935, 991, 1050, 1057, 1160, 1171, 1197, 1328, 1380, 1399, 1405, 1440, 1476, 1489, 1536, 1607, 1649, 1692, 1750, 1780, 1825, 1840, 1846, 1852, 1873, 1928, 1945, 1956, 1970, 2043]",2043


### MolForge predictions on flipped fingerprints
***


In [15]:
# Set user directory
USER_DIR = os.path.expanduser('~')

# Path to the conda environment to MolForge
# CONDA_ENV_PATH points at the python executable inside the 'molforge' conda
# environment; it is used as the interpreter when launching the script below.
CONDA_ENV_PATH = os.path.join(USER_DIR, 'conda/envs/molforge/bin/python')
# Checkout directory of MolForge, used as working directory for the subprocess.
MOLFORGE_DIR = os.path.join(USER_DIR, 'MolForge')
# Prediction entry point inside the MolForge checkout.
MOLFORGE_SCRIPT_PATH = os.path.join(USER_DIR, 'MolForge/predict.py')
# Number of worker threads, i.e. MolForge subprocesses running at the same time.
WORKERS = 4

In [16]:
def convert_spaced_smiles(spaced_smiles):
    """Remove the spaces from a space-separated SMILES string and re-write it with RDKit.

    Returns the non-isomeric SMILES produced by ``Chem.MolToSmiles``, or the
    literal text "Invalid SMILES string" when RDKit cannot parse the input.
    """
    joined = "".join(spaced_smiles.split(" "))
    parsed = Chem.MolFromSmiles(joined)
    # MolFromSmiles yields None for unparsable input.
    if parsed is None:
        return "Invalid SMILES string"
    return Chem.MolToSmiles(parsed, isomericSmiles=False)

In [17]:
def run_molforge(fp_input):
    """Decode one sparse ECFP4 fingerprint into a SMILES string with MolForge.

    The MolForge ``predict.py`` script is launched as a subprocess with the
    interpreter at ``CONDA_ENV_PATH`` and its stdout is scanned for the
    prediction.

    Args:
        fp_input: The on-bit indices, either already formatted as a
            space-separated string, or given as a list, tuple or NumPy array
            of indices (these are joined with single spaces).

    Returns:
        A ``(compact_smiles, device_used)`` tuple.
        ``compact_smiles`` is the result of ``convert_spaced_smiles`` applied
        to the text following "Result:", or "No Result" when no such text was
        printed. ``device_used`` is the text following "rank :" or None when
        that marker is absent. ``("Error", "Error")`` is returned when the
        subprocess exits with a non-zero status.
    """
    # Ensure fp_input is a string.
    # Tuples and NumPy arrays are joined like lists; formatting them directly
    # would put their repr (brackets, commas) into the --input argument.
    # NOTE(review): list columns may come back as NumPy arrays after a
    # parquet round trip - confirm if reloading saved frames.
    if isinstance(fp_input, (list, tuple, np.ndarray)):
        fp_input_str = ' '.join(map(str, fp_input))
    else:
        fp_input_str = fp_input

    command = [
        CONDA_ENV_PATH,
        MOLFORGE_SCRIPT_PATH,
        "--fp=ECFP4",
        "--model_type=smiles",
        f"--input={fp_input_str}"
    ]

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
            cwd=MOLFORGE_DIR,
            env=os.environ.copy()
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running MolForge: {e}")
        print(f"Error output: {e.stderr}")
        return "Error", "Error"

    spaced_smiles = None
    device_used = None

    # If a marker occurs on several lines, the last occurrence wins.
    for line in result.stdout.splitlines():
        if "Result:" in line:
            spaced_smiles = line.split("Result:")[1].strip()
        if "rank :" in line:
            device_used = line.split("rank :")[1].strip()

    if spaced_smiles:
        compact_smiles = convert_spaced_smiles(spaced_smiles)
    else:
        compact_smiles = "No Result"

    return compact_smiles, device_used

In [18]:
def process_row(row):
    """Run MolForge on the flipped sparse fingerprint stored in one row.

    Reads ``row['SparseFingerprintBitsFlipped']`` and returns the
    ``(compact_smiles, device_used)`` pair obtained from ``run_molforge``.
    """
    flipped_bits = row['SparseFingerprintBitsFlipped']
    smiles, device = run_molforge(flipped_bits)
    return smiles, device

def process_dataframe(df, max_rows=None):
    """Run MolForge on every row of ``df`` using a pool of ``WORKERS`` threads.

    The outcome of ``process_row`` is stored per row in the columns
    'Generated_SMILES' and 'Device_Used'. Progress is printed every 100
    completed rows and once more after the last one.

    NOTE: this definition replaces the bit-flipping ``process_dataframe``
    defined in an earlier cell of the notebook.

    Args:
        df: Frame containing a 'SparseFingerprintBitsFlipped' column.
        max_rows: When truthy, only the first ``max_rows`` rows are processed
            and a new frame is returned; otherwise (None or 0) all rows are
            processed and ``df`` itself is modified and returned.

    Returns:
        The processed DataFrame with the two result columns filled in.
    """
    if max_rows:
        # Work on an independent copy: writing the result columns onto the
        # un-copied result of head() is chained assignment on a slice of the
        # caller's frame.
        df = df.head(max_rows).copy()

    total = len(df)
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        # Map each future back to the index label of the row it belongs to.
        futures = {executor.submit(process_row, row): idx for idx, row in df.iterrows()}

        # Results are written in completion order, not submission order.
        for done_count, future in enumerate(as_completed(futures), start=1):
            idx = futures[future]
            result, device_used = future.result()
            df.at[idx, 'Generated_SMILES'] = result
            df.at[idx, 'Device_Used'] = device_used
            if done_count % 100 == 0 or done_count == total:
                print(f"Processed molecule {done_count}/{total}")

    return df

In [19]:
# Decode every flipped Parecoxib fingerprint with MolForge and store the result as parquet.
parecoxib_flipped_df = process_dataframe(parecoxib_flipped_df)
parecoxib_flipped_df.to_parquet(os.path.join(OUTPUT_DIR, 'parecoxib_flipped_df.parquet'), index=False)

In [20]:
# Decode every flipped Anitrazafen fingerprint with MolForge and store the result as parquet.
anitrazafen_flipped_df = process_dataframe(anitrazafen_flipped_df)
anitrazafen_flipped_df.to_parquet(os.path.join(OUTPUT_DIR, 'anitrazafen_flipped_df.parquet'), index=False)

Processed molecule 100/2048
Processed molecule 200/2048
Processed molecule 300/2048
Processed molecule 400/2048
Processed molecule 500/2048
Processed molecule 600/2048
Processed molecule 700/2048
Processed molecule 800/2048
Processed molecule 900/2048
Processed molecule 1000/2048
Processed molecule 1100/2048
Processed molecule 1200/2048
Processed molecule 1300/2048
Processed molecule 1400/2048
Processed molecule 1500/2048
Processed molecule 1600/2048
Processed molecule 1700/2048
Processed molecule 1800/2048
Processed molecule 1900/2048
Processed molecule 2000/2048
Processed molecule 2048/2048


In [21]:
# Decode every flipped Celecoxib fingerprint with MolForge and store the result as parquet.
celecoxib_flipped_df = process_dataframe(celecoxib_flipped_df)
celecoxib_flipped_df.to_parquet(os.path.join(OUTPUT_DIR, 'celecoxib_flipped_df.parquet'), index=False)

In [49]:
# Decode every flipped Cimicoxib fingerprint with MolForge and store the result as parquet.
cimicoxib_flipped_df = process_dataframe(cimicoxib_flipped_df)
cimicoxib_flipped_df.to_parquet(os.path.join(OUTPUT_DIR, 'cimicoxib_flipped_df.parquet'), index=False)

Processed molecule 100/2048
Processed molecule 200/2048
Processed molecule 300/2048
Processed molecule 400/2048
Processed molecule 500/2048
Processed molecule 600/2048
Processed molecule 700/2048
Processed molecule 800/2048
Processed molecule 900/2048
Processed molecule 1000/2048
Processed molecule 1100/2048
Processed molecule 1200/2048
Processed molecule 1300/2048
Processed molecule 1400/2048
Processed molecule 1500/2048
Processed molecule 1600/2048
Processed molecule 1700/2048
Processed molecule 1800/2048
Processed molecule 1900/2048
Processed molecule 2000/2048
Processed molecule 2048/2048


In [50]:
# Decode every flipped Deracoxib fingerprint with MolForge and store the result as parquet.
# Generated SMILES that RDKit cannot parse are recorded as "Invalid SMILES string"
# by convert_spaced_smiles; RDKit prints its own parse error for them.
deracoxib_flipped_df = process_dataframe(deracoxib_flipped_df)
deracoxib_flipped_df.to_parquet(os.path.join(OUTPUT_DIR, 'deracoxib_flipped_df.parquet'), index=False)

Processed molecule 100/2048
Processed molecule 200/2048
Processed molecule 300/2048
Processed molecule 400/2048
Processed molecule 500/2048
Processed molecule 600/2048
Processed molecule 700/2048
Processed molecule 800/2048
Processed molecule 900/2048


[08:51:25] SMILES Parse Error: extra close parentheses while parsing: COC1=C(C=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(C4=NN(C(=C4)C5=CC(=C(C=C5)OC)F)C6=CC=C(C=C6)S(=O)(=O)N)C7=CC(=C=C)F)F)F
[08:51:25] SMILES Parse Error: Failed parsing SMILES 'COC1=C(C=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(C4=NN(C(=C4)C5=CC(=C(C=C5)OC)F)C6=CC=C(C=C6)S(=O)(=O)N)C7=CC(=C=C)F)F)F' for input: 'COC1=C(C=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(C4=NN(C(=C4)C5=CC(=C(C=C5)OC)F)C6=CC=C(C=C6)S(=O)(=O)N)C7=CC(=C=C)F)F)F'


Processed molecule 1000/2048
Processed molecule 1100/2048
Processed molecule 1200/2048
Processed molecule 1300/2048
Processed molecule 1400/2048
Processed molecule 1500/2048
Processed molecule 1600/2048
Processed molecule 1700/2048
Processed molecule 1800/2048
Processed molecule 1900/2048
Processed molecule 2000/2048
Processed molecule 2048/2048
