# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [46]:
import os
import pandas as pd
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import PandasTools
import numpy as np
from joblib import Parallel, delayed
from rdkit.DataStructs import TanimotoSimilarity
import subprocess
from typing import List, Tuple
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from rdkit import Chem
import random
from typing import List, Tuple
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

### Define directories
***

In [47]:
# Set user directory
USER_DIR = os.path.expanduser('~')

# Set the project root directory: the parent of the current working directory,
# so every path below depends on where the notebook kernel was started.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Define the input and output directories (both live under <PROJECT_ROOT>/data)
INPUT_DIR = os.path.join(PROJECT_ROOT, 'data/')
OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'data/bit_flipping_nn_review_2')

# Ensure output directory exists (no error if it is already there)
os.makedirs(OUTPUT_DIR, exist_ok=True)


### COX2 and Janus Data
***

In [48]:
# Read COX2_SMILES.csv (semicolon-delimited) from the shared input directory.
# INPUT_DIR is <parent of cwd>/data, i.e. the same location as '../data/',
# so the data folder is configured in one place only.
cox2_df = pd.read_csv(os.path.join(INPUT_DIR, 'COX2_SMILES.csv'), delimiter=';')
print(f"COX2 shape: {cox2_df.shape}")

# Read Janus_SMILES.csv (same layout, same directory)
janus_df = pd.read_csv(os.path.join(INPUT_DIR, 'Janus_SMILES.csv'), delimiter=';')
print(f"Janus shape: {janus_df.shape}")

COX2 shape: (21, 2)
Janus shape: (12, 2)


In [49]:
# Parse each SMILES string into an RDKit molecule stored in a new 'Molecule' column
for _frame in (cox2_df, janus_df):
    PandasTools.AddMoleculeColumnToFrame(_frame, smilesCol='SMILES', molCol='Molecule')

In [50]:
cox2_df.head()

Unnamed: 0,title,SMILES,Molecule
0,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb6ff0>
1,Anitrazafen,COc1ccc(cc1)c2nnc(C)nc2c3ccc(OC)cc3,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7140>
2,Celecoxib,Cc1ccc(cc1)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb70d0>
3,Cimicoxib,COc1ccc(cc1F)c2c(Cl)ncn2c3ccc(cc3)S(=O)(=O)N,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7a00>
4,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb73e0>


In [51]:
janus_df.head()

Unnamed: 0,title,SMILES,Molecule
0,Tofacitinib,C[C@@H]1CCN(C[C@@H]1N(C)c2ncnc3[nH]ccc23)C(=O)CC#N,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7530>
1,Ruxolitinib,N#CC[C@H](C1CCCC1)n2cc(cn2)c3ncnc4[nH]ccc34,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7680>
2,Oclacitinib,CNS(=O)(=O)C[C@@H]1CC[C@H](CC1)N(C)c2ncnc3[nH]ccc23,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7c30>
3,Baricitinib,CCS(=O)(=O)N1CC(CC#N)(C1)n2cc(cn2)c3ncnc4[nH]ccc34,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7b50>
4,Upadacitinib,CC[C@@H]1CN(C[C@@H]1c2cnc3cnc4[nH]ccc4n23)C(=O)NCC(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7990>


In [52]:
def generate_fingerprint(mol):
    """
    Compute the ECFP4 fingerprint of a molecule in dense and sparse form.

    A Morgan generator with radius 2 and 2048 bits is used. The result is a
    pair ``(bit_vector, sparse_str)``: ``bit_vector`` is an integer numpy
    array of 0/1 values and ``sparse_str`` is the space-separated list of the
    indices that are set. ``(None, None)`` is returned when ``mol`` is None.
    """
    if mol is None:
        return None, None

    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    fingerprint = generator.GetFingerprint(mol)
    # The bit string is a sequence of '0'/'1' characters, one per bit
    bits = np.array(list(fingerprint.ToBitString())).astype(int)
    on_bits = list(np.where(bits == 1)[0])
    return bits, ' '.join(str(index) for index in on_bits)

In [53]:
# Apply the fingerprint generation function to COX2 DataFrame
# (one joblib task per molecule, using all available cores via n_jobs=-1)
cox2_results = Parallel(n_jobs=-1)(delayed(generate_fingerprint)(mol) for mol in cox2_df['Molecule'])
# Split the list of (bit_vector, sparse_string) pairs into two parallel tuples
cox2_bit_vectors, cox2_sparse_fingerprints = zip(*cox2_results)

# Assign the new columns to the COX2 DataFrame
cox2_df['FingerprintBits'] = list(cox2_bit_vectors)
cox2_df['SparseFingerprintBits'] = list(cox2_sparse_fingerprints)

# Apply the fingerprint generation function to Janus DataFrame
janus_results = Parallel(n_jobs=-1)(delayed(generate_fingerprint)(mol) for mol in janus_df['Molecule'])
# Same unzip step as for the COX2 results above
janus_bit_vectors, janus_sparse_fingerprints = zip(*janus_results)

# Assign the new columns to the Janus DataFrame
janus_df['FingerprintBits'] = list(janus_bit_vectors)
janus_df['SparseFingerprintBits'] = list(janus_sparse_fingerprints)


In [54]:
cox2_df.head()

Unnamed: 0,title,SMILES,Molecule,FingerprintBits,SparseFingerprintBits
0,Parecoxib,CCC(=O)NS(=O)(=O)c1ccc(cc1)c2c(C)onc2c3ccccc3,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb6ff0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",41 80 179 248 294 307 319 323 331 350 361 378 389 461 471 476 502 624 650 656 715 736 807 835 883 896 898 1045 1057 1088 1141 1152 1160 1199 1366 1380 1476 1542 1722 1746 1747 1750 1873 1917
1,Anitrazafen,COc1ccc(cc1)c2nnc(C)nc2c3ccc(OC)cc3,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7140>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",123 139 269 322 323 357 378 454 471 695 718 781 831 841 896 978 1057 1126 1160 1164 1380 1536 1722 1750 1855 1873
2,Celecoxib,Cc1ccc(cc1)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb70d0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",114 233 235 319 350 368 378 650 715 753 809 833 875 896 935 1057 1160 1171 1197 1328 1380 1399 1434 1440 1446 1453 1476 1489 1527 1607 1692 1722 1750 1823 1825 1852 1873 1920 1928 1956
3,Cimicoxib,COc1ccc(cc1F)c2c(Cl)ncn2c3ccc(cc3)S(=O)(=O)N,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7a00>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",27 94 118 233 249 289 319 339 350 378 561 564 650 695 699 704 715 724 746 833 841 875 879 896 932 935 1057 1160 1171 1197 1260 1380 1445 1452 1476 1489 1536 1607 1649 1683 1750 1825 1840 1873 1928 1945 1970
4,Deracoxib,COc1ccc(cc1F)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb73e0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]",1 94 118 233 249 259 305 319 350 378 501 650 695 699 715 833 841 875 896 935 991 1050 1057 1160 1171 1197 1328 1380 1399 1405 1440 1476 1489 1536 1607 1649 1692 1750 1780 1825 1840 1846 1852 1873 1928 1945 1956 1970


In [55]:
janus_df.head()

Unnamed: 0,title,SMILES,Molecule,FingerprintBits,SparseFingerprintBits
0,Tofacitinib,C[C@@H]1CCN(C[C@@H]1N(C)c2ncnc3[nH]ccc23)C(=O)CC#N,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7530>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",51 72 80 87 259 361 369 378 417 479 489 506 560 590 650 652 674 678 711 739 788 790 806 807 881 926 935 1009 1019 1026 1057 1089 1114 1152 1163 1171 1309 1325 1357 1380 1384 1452 1480 1506 1750 1764 1807 1810 1853 1859 1860 1873 1917 1973 2009
1,Ruxolitinib,N#CC[C@H](C1CCCC1)n2cc(cn2)c3ncnc4[nH]ccc34,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7680>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",1 43 80 87 276 323 335 339 360 378 464 479 489 674 711 741 790 806 808 890 926 935 944 1019 1026 1028 1051 1089 1114 1152 1160 1161 1171 1256 1325 1348 1357 1380 1384 1452 1506 1535 1634 1706 1739 1750 1810 1873 1876 1897 1978
2,Oclacitinib,CNS(=O)(=O)C[C@@H]1CC[C@H](CC1)N(C)c2ncnc3[nH]ccc23,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7c30>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",29 51 72 80 87 169 259 312 350 369 378 479 489 565 650 711 788 806 809 881 904 926 1019 1025 1026 1057 1114 1152 1154 1163 1171 1325 1357 1380 1430 1452 1454 1476 1506 1524 1561 1671 1750 1810 1873 2018
3,Baricitinib,CCS(=O)(=O)N1CC(CC#N)(C1)n2cc(cn2)c3ncnc4[nH]ccc34,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7b50>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",43 80 87 141 225 226 276 294 323 350 378 464 479 489 585 602 650 674 675 711 723 741 790 806 872 923 926 935 1026 1057 1060 1089 1114 1152 1160 1171 1256 1357 1380 1384 1452 1476 1502 1506 1535 1700 1750 1810 1817 1873 1876 1897 1915 1971
4,Upadacitinib,CC[C@@H]1CN(C[C@@H]1c2cnc3cnc4[nH]ccc4n23)C(=O)NCC(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7fc79cfb7990>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",45 48 80 114 187 197 213 218 226 294 311 378 455 479 503 510 546 548 607 638 650 711 739 806 807 817 819 887 926 935 1009 1019 1039 1057 1092 1104 1114 1123 1152 1225 1228 1231 1380 1452 1453 1454 1506 1535 1549 1750 1799 1826 1873 1917 1928 2009


### Extract Nearest Neighbors of Parecoxib
***

In [56]:
# Keep, for each molecule of interest, the COX2 rows whose title matches its name
parecoxib = cox2_df.loc[cox2_df['title'] == 'Parecoxib']
anitrazafen = cox2_df.loc[cox2_df['title'] == 'Anitrazafen']
celecoxib = cox2_df.loc[cox2_df['title'] == 'Celecoxib']
cimicoxib = cox2_df.loc[cox2_df['title'] == 'Cimicoxib']
deracoxib = cox2_df.loc[cox2_df['title'] == 'Deracoxib']

### Flip Bits
***

In [57]:
def flip_bits(vector: List[int], positions: List[int]) -> List[int]:
    """Return a copy of ``vector`` in which every listed position is inverted.

    The input is left untouched. A position that appears twice is inverted
    twice and therefore ends up with its original value.
    """
    result = vector.copy()
    for index in positions:
        current = result[index]
        result[index] = 1 - current
    return result

def batched_random_bit_flipping(fingerprint: List[int], batch_size: int, rng=None) -> List[Tuple[List[int], List[int]]]:
    """Flip disjoint random batches of bits, one batch per returned variant.

    All bit positions are shuffled once and cut into consecutive chunks of
    ``batch_size`` positions (the last chunk may be shorter). For each chunk a
    copy of ``fingerprint`` with exactly those positions inverted is produced,
    so every position is flipped in exactly one variant.

    Args:
        fingerprint: Dense 0/1 fingerprint.
        batch_size: Number of positions flipped per variant; must be positive.
        rng: Optional ``random.Random`` instance used for the shuffle, which
            makes the result reproducible. The module-level ``random`` state
            is used when omitted.

    Returns:
        A list of ``(positions, flipped_fingerprint)`` pairs; empty when the
        fingerprint has no bits.

    Raises:
        ValueError: If ``batch_size`` is not positive. Without this check a
            negative value silently produced no variants at all.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_bits = len(fingerprint)
    all_positions = list(range(total_bits))
    shuffler = random if rng is None else rng
    shuffler.shuffle(all_positions)

    results = []
    for start in range(0, total_bits, batch_size):
        batch_positions = all_positions[start:start + batch_size]
        results.append((batch_positions, flip_bits(fingerprint, batch_positions)))

    return results

def sparse_to_dense(sparse_fingerprint: List[int], size: int = 2048) -> List[int]:
    """Convert sparse fingerprint to dense fingerprint.

    Args:
        sparse_fingerprint: Indices of the bits that are set.
        size: Length of the dense vector to build.

    Returns:
        A list of ``size`` integers, 1 at every listed index and 0 elsewhere.

    Raises:
        IndexError: If an index lies outside ``[0, size)``. Negative indices
            are rejected explicitly; plain list indexing would otherwise
            accept them and silently set a bit counted from the end.
    """
    dense = [0] * size
    for bit in sparse_fingerprint:
        if not 0 <= bit < size:
            raise IndexError(f"bit index {bit} is outside the fingerprint range [0, {size})")
        dense[bit] = 1
    return dense

def dense_to_sparse(dense_fingerprint: List[int]) -> List[int]:
    """Return, in ascending order, the indices whose value equals 1."""
    on_bits = []
    for index, value in enumerate(dense_fingerprint):
        if value == 1:
            on_bits.append(index)
    return on_bits

def generate_flipped_fingerprints(row: pd.Series, batch_size: int) -> List[dict]:
    """Build one record per bit-flipped variant of the fingerprint held in ``row``.

    ``row`` must provide 'title', 'SMILES', 'FingerprintBits' and
    'SparseFingerprintBits'. Every record repeats those four values and adds
    the flipped dense vector, its on-bit indices (as a list) and the positions
    that were flipped.
    """
    original_dense = row['FingerprintBits']
    original_sparse = row['SparseFingerprintBits']

    return [
        {
            'title': row['title'],
            'SMILES': row['SMILES'],
            'FingerprintBits': original_dense,
            'SparseFingerprintBits': original_sparse,
            'FingerprintBitsFlipped': flipped_dense,
            'SparseFingerprintBitsFlipped': dense_to_sparse(flipped_dense),
            'FlippedBitPositions': positions,
        }
        for positions, flipped_dense in batched_random_bit_flipping(original_dense, batch_size)
    ]


def process_dataframe(df: pd.DataFrame, batch_size: int) -> pd.DataFrame:
    """Expand every row of ``df`` into its batched bit-flipped variants.

    The records produced for each row are concatenated in row order and
    returned as a new DataFrame; ``df`` itself is not modified.

    NOTE: a later cell defines another ``process_dataframe(df, max_rows=None)``
    that replaces this one, so this cell must be re-run before the
    bit-flipping cells are executed again.
    """
    flipped_rows = [
        record
        for _, row in df.iterrows()
        for record in generate_flipped_fingerprints(row, batch_size)
    ]
    return pd.DataFrame(flipped_rows)

In [58]:
# 8 random bit flips per variant: the 2048 positions are split into
# 2048 / 8 = 256 disjoint batches, giving 256 variants per molecule
parecoxib_df_8_random_flips = process_dataframe(parecoxib, 8)
anitrazafen_df_8_random_flips = process_dataframe(anitrazafen, 8)
celecoxib_df_8_random_flips = process_dataframe(celecoxib, 8)
cimicoxib_df_8_random_flips = process_dataframe(cimicoxib, 8)
deracoxib_df_8_random_flips = process_dataframe(deracoxib, 8)

In [59]:
# 128 random bit flips per variant: 2048 / 128 = 16 variants per molecule
parecoxib_df_128_random_flips = process_dataframe(parecoxib, 128)
anitrazafen_df_128_random_flips = process_dataframe(anitrazafen, 128)
celecoxib_df_128_random_flips = process_dataframe(celecoxib, 128)
cimicoxib_df_128_random_flips = process_dataframe(cimicoxib, 128)
deracoxib_df_128_random_flips = process_dataframe(deracoxib, 128)

In [60]:
# 1024 random bit flips per variant: 2048 / 1024 = 2 variants per molecule
parecoxib_df_1024_random_flips = process_dataframe(parecoxib, 1024)
anitrazafen_df_1024_random_flips = process_dataframe(anitrazafen, 1024)
celecoxib_df_1024_random_flips = process_dataframe(celecoxib, 1024)
cimicoxib_df_1024_random_flips = process_dataframe(cimicoxib, 1024)
deracoxib_df_1024_random_flips = process_dataframe(deracoxib, 1024)

### MolForge predictions on flipped fingerprints
***


In [61]:
# Set user directory (same value as the USER_DIR defined in the directory cell)
USER_DIR = os.path.expanduser('~')

# Path to the conda environment to MolForge
# (despite its name, CONDA_ENV_PATH is the Python interpreter inside that environment)
CONDA_ENV_PATH = os.path.join(USER_DIR, 'conda/envs/molforge/bin/python')
# MolForge checkout; used as the working directory of the prediction subprocess
MOLFORGE_DIR = os.path.join(USER_DIR, 'MolForge')
# Prediction script executed with the interpreter above
MOLFORGE_SCRIPT_PATH = os.path.join(USER_DIR, 'MolForge/predict.py')
# Number of threads, i.e. MolForge subprocesses running at the same time
WORKERS = 4

In [62]:
def convert_spaced_smiles(spaced_smiles):
    """Turn a space-separated SMILES token string into a non-isomeric RDKit SMILES.

    All spaces are removed and the result is parsed with RDKit. The literal
    string "Invalid SMILES string" is returned when parsing fails.
    """
    mol = Chem.MolFromSmiles(spaced_smiles.replace(" ", ""))
    if mol is None:
        return "Invalid SMILES string"
    return Chem.MolToSmiles(mol, isomericSmiles=False)

In [63]:
def run_molforge(fp_input):
    """Decode one sparse fingerprint with the MolForge prediction script.

    ``fp_input`` is either a list of bit indices or an already formatted
    string; a list is joined with single spaces. The script is run in
    MOLFORGE_DIR with the interpreter at CONDA_ENV_PATH.

    Returns a ``(smiles, device)`` pair:
      * ``smiles`` is the converted text of the last stdout line containing
        "Result:", or "No Result" when no such line (or an empty one) exists.
      * ``device`` is the text following "rank :" in the last stdout line that
        contains it, or None.
    ``("Error", "Error")`` is returned when the subprocess exits with a
    non-zero status.
    """
    # MolForge expects the indices as one space-separated string
    fp_input_str = ' '.join(map(str, fp_input)) if isinstance(fp_input, list) else fp_input

    command = [
        CONDA_ENV_PATH,
        MOLFORGE_SCRIPT_PATH,
        "--fp=ECFP4",
        "--model_type=smiles",
        f"--input={fp_input_str}",
    ]
    run_options = dict(
        capture_output=True,
        text=True,
        check=True,
        cwd=MOLFORGE_DIR,
        env=os.environ.copy(),
    )

    try:
        result = subprocess.run(command, **run_options)
    except subprocess.CalledProcessError as e:
        print(f"Error running MolForge: {e}")
        print(f"Error output: {e.stderr}")
        return "Error", "Error"

    spaced_smiles = None
    device_used = None
    # Later matches overwrite earlier ones, so the last matching line wins
    for line in result.stdout.splitlines():
        if "Result:" in line:
            spaced_smiles = line.split("Result:")[1].strip()
        if "rank :" in line:
            device_used = line.split("rank :")[1].strip()

    compact_smiles = convert_spaced_smiles(spaced_smiles) if spaced_smiles else "No Result"
    return compact_smiles, device_used

In [64]:
def process_row(row):
    """Run MolForge on the flipped sparse fingerprint stored in ``row``.

    Returns the ``(smiles, device)`` pair produced by ``run_molforge``.
    """
    smiles, device = run_molforge(row['SparseFingerprintBitsFlipped'])
    return smiles, device

def process_dataframe(df, max_rows=None):
    """Decode every flipped fingerprint in ``df`` with MolForge, WORKERS rows at a time.

    NOTE: this redefines the ``process_dataframe(df, batch_size)`` declared in
    the bit-flipping cell; after this cell has run, that earlier cell must be
    re-executed before generating flipped fingerprints again.

    Args:
        df: Frame with a 'SparseFingerprintBitsFlipped' column.
        max_rows: When given, only the first ``max_rows`` rows are processed
            (0 processes nothing). None processes the whole frame.

    Returns:
        The processed frame with 'Generated_SMILES' and 'Device_Used' filled
        in. When ``max_rows`` is None this is ``df`` itself, updated in place;
        otherwise it is an independent copy of the selected rows.
    """
    if max_rows is not None:
        # Copy the selection so the column writes below land on an independent
        # frame instead of on a slice of the caller's data.
        df = df.head(max_rows).copy()

    total = len(df)
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        # Remember which row label each future belongs to; results arrive out of order
        futures = {executor.submit(process_row, row): idx for idx, row in df.iterrows()}

        for completed, future in enumerate(as_completed(futures), start=1):
            idx = futures[future]
            result, device_used = future.result()
            df.at[idx, 'Generated_SMILES'] = result
            df.at[idx, 'Device_Used'] = device_used
            # Progress line every 100 rows and once at the end
            if completed % 100 == 0 or completed == total:
                print(f"Processed molecule {completed}/{total}")

    return df

In [65]:
# Decode the flipped Parecoxib fingerprints with MolForge for each batch size
# (8, 128, 1024) and store every result table as a parquet file in OUTPUT_DIR
parecoxib_df_8_random_flips = process_dataframe(parecoxib_df_8_random_flips)
parecoxib_df_8_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'parecoxib_df_8_random_flips.parquet'), index=False)

parecoxib_df_128_random_flips = process_dataframe(parecoxib_df_128_random_flips)
parecoxib_df_128_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'parecoxib_df_128_random_flips.parquet'), index=False)

parecoxib_df_1024_random_flips = process_dataframe(parecoxib_df_1024_random_flips)
parecoxib_df_1024_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'parecoxib_df_1024_random_flips.parquet'), index=False)

Processed molecule 100/256


[23:24:21] SMILES Parse Error: unclosed ring for input: 'CCC(=O)NS(=O)(=O)C1=CC=C(C=C1)C2=C(ON=C2C3=CC=CC=C3)C4=CC=C(C=C4)C5=C6C(=C)C(=C)C(=C)C6=C'
[23:24:32] SMILES Parse Error: unclosed ring for input: 'CCC(=O)NS(=O)(=O)C1=CC=C(C=C1)C2=C3C=CC=CC3=C4C(=C(ON4)C)C5=CC=CC=C5'
[23:24:37] SMILES Parse Error: unclosed ring for input: 'CCC(=O)NS(=O)(=O)C1=CC=C(C=C1)C2=NOC3=C2CC4=C(ON=C4C5=CC=CC=C5)C'


Processed molecule 200/256
Processed molecule 256/256


[23:26:01] SMILES Parse Error: syntax error while parsing: CCC(=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=
[23:26:01] SMILES Parse Error: Failed parsing SMILES 'CCC(=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=' for input: 'CCC(=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C=C='


Processed molecule 16/16
Processed molecule 2/2


[23:26:06] SMILES Parse Error: unclosed ring for input: 'C1=C2C=C3C(=C4C(=CC(=C5C(=CC(=C6C(=CC(=C7C(=CC(=C1C2=C7)C8=CC(=C9C8(C7=C9)CC)CC)C#C9)C5)C7)C=C6)C5)C4=C3)CC)O'
[23:26:06] SMILES Parse Error: extra open parentheses for input: 'C(=C1C(=C2C(=C3C(=C4C(=C5C(=C6C(=C7C(=C(C(=C7P(C(C(C(C(C1=N2)(C(C(C(C(C(C(C(C(=C3)P(C(C(C(=C4)C)C)C)C)P(F)F)C)C)C)P(F)F)F)F)F)F)F'


In [66]:
# Decode the flipped Anitrazafen fingerprints with MolForge for each batch size
# (8, 128, 1024) and store every result table as a parquet file in OUTPUT_DIR
anitrazafen_df_8_random_flips = process_dataframe(anitrazafen_df_8_random_flips)
anitrazafen_df_8_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'anitrazafen_df_8_random_flips.parquet'), index=False)

anitrazafen_df_128_random_flips = process_dataframe(anitrazafen_df_128_random_flips)
anitrazafen_df_128_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'anitrazafen_df_128_random_flips.parquet'), index=False)

anitrazafen_df_1024_random_flips = process_dataframe(anitrazafen_df_1024_random_flips)
anitrazafen_df_1024_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'anitrazafen_df_1024_random_flips.parquet'), index=False)

[23:26:12] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)C4=C(N=C(N=N4)C)C5=CC=C(C=C5)OC'
[23:26:14] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)C4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:26:17] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NN=C(C(=NN=C(N=N1)C3=NC(=C(N=N3)C4=CC=C(C=C4)OC)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC)C7=CC=C(C=C7)OC'
[23:26:25] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)C4=CC=C(C=C4)C5=C(N=C(N=N5)C)C6=CC=C(C=C6)OC'
[23:26:30] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)[Se]OC4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:26:30] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N2)C3=CC=C(C=C3)OC)C4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:26:32] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)C4=CC=C(C=C4)OC'
[23:26:39] SMILES Parse Error: unclosed ring

[23:27:00] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)N=NC4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'


Processed molecule 100/256


[23:27:08] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)C4=C(N=C(N=N4)C)C5=CC=C(C=C5)OC'
[23:27:13] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)N=NC4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:27:16] SMILES Parse Error: unclosed ring for input: '[B-]1(C2=CC=C(C=C2)C3=C(N=C(N=N3)C)C4=CC=C(C=C4)OC)(C5=C(N=C(N=N5)C)C6=CC=C(C=C6)OC)C7=CC=C(C=C7)OC'
[23:27:16] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)N=NC4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:27:28] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)N=NC4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:27:28] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=C(N=N1)C3=NC(=C(N=N3)C4=CC=C(C=C4)OC)C5=C(N=C(N=N5)C)C6=CC=C(C=C6)OC'
[23:27:48] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)N=NC4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:27:50] SMI

Processed molecule 200/256


[23:28:04] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=C(N=N1)C3=NC(=C(N=N3)C4=CC=C(C=C4)OC)C5=C(N=C(N=N5)C)C6=CC=C(C=C6)OC'
[23:28:05] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)C4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:28:05] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=C(N=N1)C3=NC(=C(N=N3)C4=CC=C(C=C4)OC)N=NC5=NC(=C(N=N5)C6=CC=C(C=C6)OC)C7=CC=C(C=C7)OC'
[23:28:16] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)N=NC4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC'
[23:28:17] SMILES Parse Error: unclosed ring for input: 'CC1=NC(=C(N=N1)C2=CC=C(C=C2)OC)[N+]3(C4=NC(=C(N=C4C5=CC=C(C=C5)OC)C6=CC=C(C=C6)OC)C7=CC=C(C=C7)OC)C8=NC(=NN=N8)C'
[23:28:22] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)C4=C(N=C(N=N4)OC)C5=CC=C(C=C5)OC'
[23:28:23] SMILES Parse Error: unclosed ring for input: 'CC1=NC2=NC(=C(N=N1)C3=CC=C(C=C3)OC)N=NC4=NC(=C(N=N4)C5=CC=C(C=C5)OC)C6=CC=C(C=

Processed molecule 256/256


[23:28:37] SMILES Parse Error: unclosed ring for input: 'COC1=CC=C(C=C1)C2=C(C(=NN=N2)C3=CC=C(C=C3)OC)C4=CC=C(C=C4)C5=C6C(=C7C(=NN=C(N7)OC)C=CS8=CC=CS(=N8)OC)C=C=[NH2+]6'
[23:28:40] SMILES Parse Error: unclosed ring for input: 'COC1=CC=C(C=C1)C2=C3C=[N+](C=C3)C4=CC=C(C=C4)C5=C6C=CC=C(P6C7=CC=CC=C7P5OCC8=CC=C9C=C[N+](=CC=C9)C8=N7)OC'
[23:28:40] SMILES Parse Error: extra close parentheses while parsing: COC1=CC=C(C=C1)C2=C(N=NC(=N2)CC(=C3C(=C4C=CC(=C5C(=C6C=C7C=CC=C(C7=N6)C5(C)C)C8=CC=C(C=C8)OC)C=C4)N=N3)OC)C9=CC=C(C=C9)O)O
[23:28:40] SMILES Parse Error: Failed parsing SMILES 'COC1=CC=C(C=C1)C2=C(N=NC(=N2)CC(=C3C(=C4C=CC(=C5C(=C6C=C7C=CC=C(C7=N6)C5(C)C)C8=CC=C(C=C8)OC)C=C4)N=N3)OC)C9=CC=C(C=C9)O)O' for input: 'COC1=CC=C(C=C1)C2=C(N=NC(=N2)CC(=C3C(=C4C=CC(=C5C(=C6C=C7C=CC=C(C7=N6)C5(C)C)C8=CC=C(C=C8)OC)C=C4)N=N3)OC)C9=CC=C(C=C9)O)O'
[23:28:46] SMILES Parse Error: unclosed ring for input: 'COC1=CC=C(C=C1)C2=C(C(=NN=N2)C3=CC=C(C=C3)OC)[NH+]=C4C(C(=C(C4=C5C(=C6C(=N)C(=C7C(=N)C(=CN=N7)S)C(=N)

Processed molecule 16/16
Processed molecule 2/2


[23:28:48] SMILES Parse Error: syntax error while parsing: C1=C2C(=C3C1=C4C5=C6C(=C7C(C(C(C7=C8C(C(C(C(C8=C9C(=C7S9)C#CC(C(=C8[S-])C#C9)C(C(=C2)[S-])[S-])C(C(=C4)[S-])[S-])[S-])(C(C(C)C)[S-])[S-])[S-])(C(C(C(C(C(C(=
[23:28:48] SMILES Parse Error: Failed parsing SMILES 'C1=C2C(=C3C1=C4C5=C6C(=C7C(C(C(C7=C8C(C(C(C(C8=C9C(=C7S9)C#CC(C(=C8[S-])C#C9)C(C(=C2)[S-])[S-])C(C(=C4)[S-])[S-])[S-])(C(C(C)C)[S-])[S-])[S-])(C(C(C(C(C(C(=' for input: 'C1=C2C(=C3C1=C4C5=C6C(=C7C(C(C(C7=C8C(C(C(C(C8=C9C(=C7S9)C#CC(C(=C8[S-])C#C9)C(C(=C2)[S-])[S-])C(C(=C4)[S-])[S-])[S-])(C(C(C)C)[S-])[S-])[S-])(C(C(C(C(C(C(='


In [67]:
# Decode the flipped Celecoxib fingerprints with MolForge for each batch size
# (8, 128, 1024) and store every result table as a parquet file in OUTPUT_DIR
celecoxib_df_8_random_flips = process_dataframe(celecoxib_df_8_random_flips)
celecoxib_df_8_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'celecoxib_df_8_random_flips.parquet'), index=False)

celecoxib_df_128_random_flips = process_dataframe(celecoxib_df_128_random_flips)
celecoxib_df_128_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'celecoxib_df_128_random_flips.parquet'), index=False)

celecoxib_df_1024_random_flips = process_dataframe(celecoxib_df_1024_random_flips)
celecoxib_df_1024_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'celecoxib_df_1024_random_flips.parquet'), index=False)


Processed molecule 100/256


[23:30:22] SMILES Parse Error: extra open parentheses for input: 'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(C4=CC=C(C=C4)C)(C5=CC=C(C=C5)C)C6=CC(=NN6C7=CC=C(C=C7)S(=O)(=O)N)C(F)(F'


Processed molecule 200/256
Processed molecule 256/256


[23:31:22] SMILES Parse Error: unclosed ring for input: 'CSC1=CN(N=N1)S(=O)(=O)SC2=CC(=CC=C2)C3=NN=NN3S(=O)(=O)S(=O)(=O)N4N=C5C=C(C=C(N5O)SN)S(=O)(=O)N'
[23:31:22] SMILES Parse Error: unclosed ring for input: 'C[NH2+]/C(=C\1/C=NS(=N)N1C2=NS(=N)N)/C3=CC(=C(C=C3)OS(=O)(=O)N)CS(=N)(=O)C4=CC=C(C=C4)C5=N/C(=C/6\C=NS(=N)N6)/C=N5'
[23:31:22] SMILES Parse Error: extra close parentheses while parsing: CN\1C=CC(=N/C1=C/2\C=CC(=N2)C(N=C3C=CC(=N3)C(=O)S(=O)(=O)C4=CC=C(C=C4)S(=O)(=O)C5=NN=C(C=C5)C(N=N6)(N7C=CC(=N7)C)N=N5)(N=O)C)C)(=
[23:31:22] SMILES Parse Error: Failed parsing SMILES 'CN\1C=CC(=N/C1=C/2\C=CC(=N2)C(N=C3C=CC(=N3)C(=O)S(=O)(=O)C4=CC=C(C=C4)S(=O)(=O)C5=NN=C(C=C5)C(N=N6)(N7C=CC(=N7)C)N=N5)(N=O)C)C)(=' for input: 'CN\1C=CC(=N/C1=C/2\C=CC(=N2)C(N=C3C=CC(=N3)C(=O)S(=O)(=O)C4=CC=C(C=C4)S(=O)(=O)C5=NN=C(C=C5)C(N=N6)(N7C=CC(=N7)C)N=N5)(N=O)C)C)(='
[23:31:25] SMILES Parse Error: unclosed ring for input: 'C[Si](C)(C)N=C=C=C=C1C2=CC(=C(C(=C2)P(=O)=S)P(=O)=S)C3=CC(=C(C(=C3)S(=O)(=O)N)P(=O)=S)P(=

Processed molecule 16/16
Processed molecule 2/2


[23:31:31] SMILES Parse Error: syntax error while parsing: C1=C2C=C3C(=C4C(=CC(=C5C(C(=CC(=C6C(C(=CC(=C1C(=O)OC(C(F)(F)F)C(F)(F)F)C(F)(F)F)S6)C(F)(F)F)S5)C(F)(F)F)S4)S3)C(=P2)C(=C(C(C(=C(=
[23:31:31] SMILES Parse Error: Failed parsing SMILES 'C1=C2C=C3C(=C4C(=CC(=C5C(C(=CC(=C6C(C(=CC(=C1C(=O)OC(C(F)(F)F)C(F)(F)F)C(F)(F)F)S6)C(F)(F)F)S5)C(F)(F)F)S4)S3)C(=P2)C(=C(C(C(=C(=' for input: 'C1=C2C=C3C(=C4C(=CC(=C5C(C(=CC(=C6C(C(=CC(=C1C(=O)OC(C(F)(F)F)C(F)(F)F)C(F)(F)F)S6)C(F)(F)F)S5)C(F)(F)F)S4)S3)C(=P2)C(=C(C(C(=C(='


In [68]:
# Decode the flipped Cimicoxib fingerprints with MolForge for each batch size
# (8, 128, 1024) and store every result table as a parquet file in OUTPUT_DIR
cimicoxib_df_8_random_flips = process_dataframe(cimicoxib_df_8_random_flips)
cimicoxib_df_8_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'cimicoxib_df_8_random_flips.parquet'), index=False)

cimicoxib_df_128_random_flips = process_dataframe(cimicoxib_df_128_random_flips)
cimicoxib_df_128_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'cimicoxib_df_128_random_flips.parquet'), index=False)

cimicoxib_df_1024_random_flips = process_dataframe(cimicoxib_df_1024_random_flips)
cimicoxib_df_1024_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'cimicoxib_df_1024_random_flips.parquet'), index=False)

Processed molecule 100/256
Processed molecule 200/256
Processed molecule 256/256


[23:34:00] SMILES Parse Error: extra close parentheses while parsing: B(N)(N)OS(=O)(=O)C1=CC(=CC(=C1)C2=NC(C=C2)OC)C3=N/C(=C(\C)/C4=CC(=C(N4C3=N/C(=C/5\C=CC(C=C5)OC)/C)C6=CC(=C(C(=C6)N)C)Cl)Cl)Cl)Cl)Cl
[23:34:00] SMILES Parse Error: Failed parsing SMILES 'B(N)(N)OS(=O)(=O)C1=CC(=CC(=C1)C2=NC(C=C2)OC)C3=N/C(=C(\C)/C4=CC(=C(N4C3=N/C(=C/5\C=CC(C=C5)OC)/C)C6=CC(=C(C(=C6)N)C)Cl)Cl)Cl)Cl)Cl' for input: 'B(N)(N)OS(=O)(=O)C1=CC(=CC(=C1)C2=NC(C=C2)OC)C3=N/C(=C(\C)/C4=CC(=C(N4C3=N/C(=C/5\C=CC(C=C5)OC)/C)C6=CC(=C(C(=C6)N)C)Cl)Cl)Cl)Cl)Cl'
[23:34:02] Explicit valence for atom # 25 C, 5, is greater than permitted
[23:34:08] SMILES Parse Error: ring closure 1 duplicates bond between atom 67 and atom 69 for input: 'COOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOC1(O)C1(O)C1'


Processed molecule 16/16


[23:34:10] SMILES Parse Error: unclosed ring for input: 'C1=C2C(=C3C4=C5C(=C6C=C7C=CC(=C6)C=C7)C=C5C=C4)C(=C1)C2=C3P(=C8C=CC(=C9C=CC(=C7)C=C8)[S-])C1=CC=C9'


Processed molecule 2/2


[23:34:11] SMILES Parse Error: extra open parentheses for input: 'C1=C(C=C(C1=S)C(C(F)(F)F)(C(F)(F)F)SC(=S)C2=C(C(=C(S2)C(C(F)(F)F)(C(F)(F)F)C(F)(F)F)[Se]C(C(F)(F)F)(C(F)(F)F)C(F)(F)F)[Se]C(=S)C(C(=S)C'


In [69]:
# Decode the flipped Deracoxib fingerprints with MolForge for each batch size
# (8, 128, 1024) and store every result table as a parquet file in OUTPUT_DIR
deracoxib_df_8_random_flips = process_dataframe(deracoxib_df_8_random_flips)
deracoxib_df_8_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'deracoxib_df_8_random_flips.parquet'), index=False)

deracoxib_df_128_random_flips = process_dataframe(deracoxib_df_128_random_flips)
deracoxib_df_128_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'deracoxib_df_128_random_flips.parquet'), index=False)

deracoxib_df_1024_random_flips = process_dataframe(deracoxib_df_1024_random_flips)
deracoxib_df_1024_random_flips.to_parquet(os.path.join(OUTPUT_DIR, 'deracoxib_df_1024_random_flips.parquet'), index=False)

Processed molecule 100/256
Processed molecule 200/256
Processed molecule 256/256


[23:36:45] SMILES Parse Error: unclosed ring for input: 'CC[NH+]=C1C=C(C=C1)C2=NN=C(C=C2)C(C3=NN=C(C=C3)[NH+]=CN)C4=NN=C(C=C4)S(=O)(=O)C5=CC(=CC(=[NH+]N)N)[NH+]=NN'
[23:36:48] SMILES Parse Error: unclosed ring for input: 'COC1=CC(=C[N+](=C1)C2=C(C(=C(C(=C2)F)N3N4N3C5=C(C=C4C6=CC(=NN=N6)COO)OO)OO)OO)C7=CC(=NN=N7)COO'
[23:36:51] SMILES Parse Error: unclosed ring for input: '[B](C1=CC=C(C=C1)S(=O)(=O)N)C(C2=NN=NN2)C3=CC(=CC(=C3)[O-])C4=CC(=C(C(=C5N(N=NN=N5)OC)S(=O)(=O)C)S(=O)(=O)C)S(=O)(=O)C'


Processed molecule 16/16
Processed molecule 2/2


[23:36:53] SMILES Parse Error: unclosed ring for input: 'C1=C2C=C3C(=C4C1=C5C6=C7C(=CC(=C6)C(=C8C(C(=CC(=C9C8=CC(=N2)C#C9)C(C(F)(F)F)C(F)(F)F)C(=C7)[S-])C8)C(F)(F)F)S5)SC(=N4)[S-]'
