# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [9]:
import os
import pandas as pd
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import PandasTools
import numpy as np
from joblib import Parallel, delayed
from rdkit.DataStructs import TanimotoSimilarity
import subprocess
from typing import List, Tuple
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from rdkit import Chem
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

### Define directories
***

In [10]:
# Home directory of the current user ('~' expanded)
USER_DIR = os.path.expanduser('~')

# Project root is the parent of the current working directory, so these paths
# depend on where the notebook is launched from
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Input/output locations, all relative to the project root:
#   INPUT_DIR  - the data folder itself
#   OUTPUT_DIR - where the MolForge results for the flipped fingerprints are written
#   INPUT_PATH - the single parquet chunk that is loaded below
INPUT_DIR = os.path.join(PROJECT_ROOT, 'data/')
OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'data/bit_flipping_nn_train_molecule')
INPUT_PATH = os.path.join(PROJECT_ROOT, 'data/processed_chunks/processed_molecule_fingerprints_part_0.parquet')

# Create the output directory (and any missing parents); no error if it already exists
os.makedirs(OUTPUT_DIR, exist_ok=True)


### COX2 and Janus Data
***

In [11]:
# Load the fingerprint chunk at INPUT_PATH into a DataFrame and report its shape.
# NOTE(review): the printed label says "COX2" while INPUT_PATH points to
# processed_molecule_fingerprints_part_0.parquet - confirm the label is still accurate.
df = pd.read_parquet(INPUT_PATH)
print(f"COX2 shape: {df.shape}")


COX2 shape: (1000, 6)


In [12]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,ChEMBL ID,smiles,FingerprintBits,SparseFingerprintBits,Converted_SMILES,Device_Used
0,CHEMBL3897759,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",80 191 216 235 255 263 310 363 378 380 467 531 548 638 650 689 699 708 744 784 785 807 827 843 875 881 926 933 935 974 1013 1019 1057 1077 1097 1106 1152 1163 1261 1275 1309 1325 1357 1380 1416 1422 1480 1539 1693 1700 1722 1733 1750 1754 1765 1771 1791 1816 1823 1837 1840 1855 1873 1917 1928 2004 2016,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5ccc(F)c(F)c5)CC4)cc3)nc(N(C)C)nc2c1,cuda
1,CHEMBL3935110,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5cc(F)ccc5F)CC4)cc3)nc(N(C)C)nc2c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",80 191 216 235 249 255 263 310 363 378 380 467 548 566 638 650 689 699 708 784 785 807 843 875 881 926 933 935 974 1013 1019 1057 1068 1077 1097 1106 1152 1163 1261 1275 1309 1325 1349 1357 1380 1410 1416 1422 1480 1487 1670 1693 1700 1722 1733 1750 1754 1765 1791 1792 1816 1823 1837 1840 1855 1873 1917 1928 2004 2016,Cc1ccc2c(NCc3ccc(NC(=O)C4CCN(Cc5cc(F)ccc5F)CC4)cc3)nc(N(C)C)nc2c1,cuda
2,CHEMBL3347413,N#Cc1cc(Cl)cc(Oc2cc(CCc3cccnc3)[nH]c(=O)c2Cl)c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",24 80 102 216 240 301 305 314 322 378 437 499 561 582 645 650 674 695 726 750 790 875 892 929 1011 1031 1043 1066 1087 1088 1103 1114 1120 1136 1154 1243 1261 1292 1380 1384 1535 1586 1603 1617 1683 1740 1745 1747 1750 1754 1866 1873 1970,N#Cc1cc(Cl)cc(Oc2cc(CCc3cccnc3)[nH]c(=O)c2Cl)c1,cuda
3,CHEMBL1739263,N=C(N)c1ccc(-c2sc(-c3ccc(C(=N)N)cc3)c3c2OCCO3)cc1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",82 140 162 338 441 621 623 656 675 807 854 876 896 926 950 1104 1160 1171 1294 1326 1380 1385 1596 1697 1701 1750 1817 1860 1873,N=C(N)c1ccc(-c2sc(-c3ccc(C(=N)N)cc3)c3c2OCCO3)cc1,cuda
4,CHEMBL3917493,O=C(C=Cc1cccs1)c1cccc(NC(=O)c2cc(-c3ccncc3)[nH]n2)c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]",12 14 31 63 74 119 184 191 241 378 383 391 486 552 595 650 675 694 703 747 804 807 813 835 855 875 935 1039 1053 1088 1114 1152 1160 1208 1324 1380 1385 1509 1567 1603 1667 1683 1713 1728 1750 1816 1817 1866 1873 1899 1917 1978 1985 2004,O=C(C=Cc1cccs1)c1cccc(NC(=O)c2cc(-c3ccncc3)[nH]n2)c1,cuda


In [13]:
# Rename the lowercase 'smiles' column to 'SMILES', the key read later by
# generate_flipped_fingerprints
df = df.rename(columns={'smiles': 'SMILES'})


### Extract Nearby Neighbors of Parecoxib
***

In [14]:
# Select the rows whose 'ChEMBL ID' is CHEMBL3897759 (result is a DataFrame)
chembl_3897759 = df[df['ChEMBL ID'] == 'CHEMBL3897759']

# Select the rows whose 'ChEMBL ID' is CHEMBL3347413 (result is a DataFrame)
chembl_3347413 = df[df['ChEMBL ID'] == 'CHEMBL3347413']

### Flip Bits
***

In [15]:
def flip_single_bit(vector: List[int], position: int) -> List[int]:
    """Return a copy of ``vector`` with the entry at ``position`` inverted.

    The input is left untouched. The selected entry is replaced by
    ``1 - value``, so a 0 becomes 1 and a 1 becomes 0.
    """
    toggled = vector.copy()
    current_bit = toggled[position]
    toggled[position] = 1 - current_bit
    return toggled

def sequential_bit_flipping(fingerprint: List[int], start: int = None, end: int = None) -> List[Tuple[int, List[int]]]:
    """Flip every bit in ``[start, end)`` one at a time, in ascending index order.

    Index 0 is treated as the MSB, so the walk runs from MSB to LSB.
    ``start`` falls back to 0 and ``end`` to ``len(fingerprint)`` when left
    as None.

    Returns a list of ``(position, flipped_fingerprint)`` pairs, where each
    flipped fingerprint is a copy of the input with only that position flipped.
    """
    first = 0 if start is None else start
    stop = len(fingerprint) if end is None else end
    return [(pos, flip_single_bit(fingerprint, pos)) for pos in range(first, stop)]

def block_based_flipping(fingerprint: List[int], block_size: int, block_number: int = None) -> List[Tuple[int, List[int]]]:
    """Perform block-based single bit flipping on the fingerprint from MSB to LSB.

    The fingerprint is split into consecutive blocks of ``block_size`` bits.
    The last block is shorter when the length is not a multiple of
    ``block_size``.

    Args:
        fingerprint: Dense fingerprint to perturb.
        block_size: Number of bits per block; must be positive.
        block_number: Zero-based index of the single block to flip. When None,
            every block is processed in order.

    Returns:
        ``(position, flipped_fingerprint)`` pairs as produced by
        ``sequential_bit_flipping`` for the selected bit range.

    Raises:
        ValueError: If ``block_size`` is not positive, or ``block_number`` is
            outside ``[0, number of blocks)``.
    """
    if block_size <= 0:
        raise ValueError("Block size must be a positive integer")

    n_bits = len(fingerprint)
    # Ceiling division: a trailing partial block counts as a block, so it can
    # be selected by number just like it is covered when all blocks are flipped
    total_blocks = -(-n_bits // block_size)

    if block_number is None:
        # Blocks are contiguous and visited in order, so flipping all of them
        # is a single pass over the whole fingerprint
        return sequential_bit_flipping(fingerprint, 0, n_bits)

    if block_number < 0 or block_number >= total_blocks:
        raise ValueError(f"Block number must be between 0 and {total_blocks - 1}")

    start = block_number * block_size
    # Clamp so the last (possibly shorter) block does not run past the end
    end = min(start + block_size, n_bits)
    return sequential_bit_flipping(fingerprint, start, end)

def sparse_to_dense(sparse_fingerprint: List[int], size: int = 2048) -> List[int]:
    """Convert sparse fingerprint to dense fingerprint.

    Args:
        sparse_fingerprint: Indices of the bits that are set.
        size: Length of the dense vector to build.

    Returns:
        A list of ``size`` zeros with a 1 at every listed index.

    Raises:
        IndexError: If an index lies outside ``[0, size)``. Negative indices
            are rejected explicitly because list indexing would otherwise
            wrap them around and silently set a bit counted from the end.
    """
    dense = [0] * size
    for bit in sparse_fingerprint:
        if not 0 <= bit < size:
            raise IndexError(
                f"Bit index {bit} is out of range for a fingerprint of size {size}"
            )
        dense[bit] = 1
    return dense

def dense_to_sparse(dense_fingerprint: List[int]) -> List[int]:
    """Return the positions of all entries equal to 1, in ascending order."""
    on_bits = []
    for index, value in enumerate(dense_fingerprint):
        if value == 1:
            on_bits.append(index)
    return on_bits

def generate_flipped_fingerprints(row: pd.Series, block_size: int = None, block_number: int = None) -> List[dict]:
    """Build one output record per single-bit flip of a row's fingerprint.

    Reads 'ChEMBL ID', 'SMILES', 'FingerprintBits' and 'SparseFingerprintBits'
    from ``row``. A truthy ``block_size`` selects block-based flipping
    (optionally restricted to ``block_number``); otherwise every bit of the
    dense fingerprint is flipped in turn and ``block_number`` is not used.

    Each returned dict carries the original identifiers and fingerprints plus
    the flipped dense fingerprint, its sparse form and the flipped position.
    """
    original_dense = row['FingerprintBits']
    original_sparse = row['SparseFingerprintBits']

    flips = (
        block_based_flipping(original_dense, block_size, block_number)
        if block_size
        else sequential_bit_flipping(original_dense)
    )

    return [
        {
            'ChEMBL ID': row['ChEMBL ID'],
            'SMILES': row['SMILES'],
            'FingerprintBits': original_dense,
            'SparseFingerprintBits': original_sparse,
            'FingerprintBitsFlipped': flipped,
            'SparseFingerprintBitsFlipped': dense_to_sparse(flipped),
            'FlippedBitPosition': bit_index,
        }
        for bit_index, flipped in flips
    ]

def process_dataframe(df: pd.DataFrame, block_size: int = None, block_number: int = None) -> pd.DataFrame:
    """Expand every row of ``df`` into its single-bit-flipped variants.

    Rows are visited in frame order; the records produced by
    ``generate_flipped_fingerprints`` for each row are concatenated and
    returned as a new DataFrame.
    """
    expanded = [
        record
        for _, source_row in df.iterrows()
        for record in generate_flipped_fingerprints(source_row, block_size, block_number)
    ]
    return pd.DataFrame(expanded)


In [16]:
# Expand each selected molecule into one row per flipped bit. No block_size is
# passed, so every bit position of the dense fingerprint is flipped in turn.
chembl_3897759_flipped_df = process_dataframe(chembl_3897759)
chembl_3347413_flipped_df = process_dataframe(chembl_3347413)


In [17]:
# Check the size of the expanded frame (one row per flipped bit position)
chembl_3897759_flipped_df.shape


(2048, 7)

### MolForge predictions on flipped fingerprints
***


In [18]:
# Home directory of the current user (same value as assigned in the directory setup cell)
USER_DIR = os.path.expanduser('~')

# Python interpreter inside the 'molforge' conda environment; used as the
# executable that launches the prediction script
CONDA_ENV_PATH = os.path.join(USER_DIR, 'conda/envs/molforge/bin/python')
# MolForge checkout; used as the working directory of the subprocess
MOLFORGE_DIR = os.path.join(USER_DIR, 'MolForge')
# Prediction script that is run once per fingerprint
MOLFORGE_SCRIPT_PATH = os.path.join(USER_DIR, 'MolForge/predict.py')
# Number of threads (and therefore concurrent MolForge subprocesses) in the pool
WORKERS = 4

In [19]:
def convert_spaced_smiles(spaced_smiles):
    """Remove the spaces from a SMILES string and re-write it with RDKit.

    Returns the SMILES produced by ``Chem.MolToSmiles`` with
    ``isomericSmiles=False``, or the literal "Invalid SMILES string" when
    ``Chem.MolFromSmiles`` yields None for the compacted text.
    """
    mol = Chem.MolFromSmiles(spaced_smiles.replace(" ", ""))
    if mol is None:
        return "Invalid SMILES string"
    return Chem.MolToSmiles(mol, isomericSmiles=False)

In [20]:
def run_molforge(fp_input):
    """Run the MolForge prediction script on one fingerprint and parse its output.

    Args:
        fp_input: Fingerprint handed to MolForge through ``--input``. A string
            is used as-is; any other iterable (list, tuple, NumPy array, ...)
            is rendered as its items separated by single spaces.

    Returns:
        A ``(smiles, device)`` tuple.
        ``smiles`` is ``convert_spaced_smiles`` applied to the text after
        "Result:" in stdout, "No Result" when no such text is found, or
        "Error" when the subprocess exits with a non-zero status.
        ``device`` is the text after "rank :" in stdout, None when absent,
        or "Error" when the subprocess fails.
    """
    if isinstance(fp_input, str):
        fp_input_str = fp_input
    else:
        try:
            # Join any sequence, not only lists: a tuple or array formatted
            # directly would end up as its repr (brackets, possibly elided)
            fp_input_str = ' '.join(map(str, fp_input))
        except TypeError:
            # Not iterable: fall back to its plain text form
            fp_input_str = str(fp_input)

    command = [
        CONDA_ENV_PATH,
        MOLFORGE_SCRIPT_PATH,
        "--fp=ECFP4",
        "--model_type=smiles",
        f"--input={fp_input_str}"
    ]

    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=True,
            cwd=MOLFORGE_DIR,
            env=os.environ.copy()
        )
    except subprocess.CalledProcessError as e:
        # Best effort: report the failure and mark this row instead of aborting the batch
        print(f"Error running MolForge: {e}")
        print(f"Error output: {e.stderr}")
        return "Error", "Error"

    spaced_smiles = None
    device_used = None

    # Scan every line; when a marker occurs more than once the last occurrence wins
    for line in result.stdout.splitlines():
        if "Result:" in line:
            spaced_smiles = line.split("Result:")[1].strip()
        if "rank :" in line:
            device_used = line.split("rank :")[1].strip()

    # A missing or empty result is reported as "No Result"
    if spaced_smiles:
        compact_smiles = convert_spaced_smiles(spaced_smiles)
    else:
        compact_smiles = "No Result"

    return compact_smiles, device_used

In [21]:
def process_row(row):
    """Run MolForge on the row's 'SparseFingerprintBitsFlipped' value.

    Returns the ``(smiles, device)`` pair produced by ``run_molforge``.
    """
    smiles, device = run_molforge(row['SparseFingerprintBitsFlipped'])
    return smiles, device

def process_dataframe(df, max_rows=None):
    """Run MolForge on every row of ``df`` using a pool of ``WORKERS`` threads.

    NOTE: this reuses the name ``process_dataframe`` of the bit-flipping
    helper defined earlier in this file; once this definition has run, the
    name refers to this function.

    Args:
        df: Frame whose rows provide 'SparseFingerprintBitsFlipped' (read by
            ``process_row``). Results are written back by index label, so the
            labels are assumed to be unique - TODO confirm for other callers.
        max_rows: When truthy, only the first ``max_rows`` rows are processed.

    Returns:
        The frame with 'Generated_SMILES' and 'Device_Used' filled in. Without
        ``max_rows`` this is the input frame itself, updated in place; with
        ``max_rows`` it is an independent copy of the leading rows.
    """
    if max_rows:
        # Copy the slice: assigning into the object returned by head() is a
        # chained assignment on a view of the caller's frame
        df = df.head(max_rows).copy()

    total = len(df)
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        # Map each future to the index label of the row it was submitted for
        futures = {executor.submit(process_row, row): idx for idx, row in df.iterrows()}

        # Results arrive in completion order, not submission order
        for done, future in enumerate(as_completed(futures), start=1):
            idx = futures[future]
            result, device_used = future.result()
            df.at[idx, 'Generated_SMILES'] = result
            df.at[idx, 'Device_Used'] = device_used
            if done % 100 == 0 or done == total:
                print(f"Processed molecule {done}/{total}")

    return df

In [22]:
# Run MolForge on every flipped fingerprint of CHEMBL3347413 (this calls the
# MolForge version of process_dataframe defined just above), then save the
# results to OUTPUT_DIR without writing the index
chembl_3347413_flipped_df = process_dataframe(chembl_3347413_flipped_df)
chembl_3347413_flipped_df.to_parquet(os.path.join(OUTPUT_DIR, 'chembl_3347413_flipped_df.parquet'), index=False)

Processed molecule 100/2048
Processed molecule 200/2048
Processed molecule 300/2048
Processed molecule 400/2048
Processed molecule 500/2048
Processed molecule 600/2048
Processed molecule 700/2048
Processed molecule 800/2048
Processed molecule 900/2048
Processed molecule 1000/2048
Processed molecule 1100/2048
Processed molecule 1200/2048
Processed molecule 1300/2048
Processed molecule 1400/2048
Processed molecule 1500/2048
Processed molecule 1600/2048
Processed molecule 1700/2048
Processed molecule 1800/2048
Processed molecule 1900/2048
Processed molecule 2000/2048
Processed molecule 2048/2048


In [23]:
# Run MolForge on every flipped fingerprint of CHEMBL3897759 (this calls the
# MolForge version of process_dataframe), then save the results to OUTPUT_DIR
# without writing the index
chembl_3897759_flipped_df = process_dataframe(chembl_3897759_flipped_df)
chembl_3897759_flipped_df.to_parquet(os.path.join(OUTPUT_DIR, 'chembl_3897759_flipped_df.parquet'), index=False)

Processed molecule 100/2048
Processed molecule 200/2048
Processed molecule 300/2048
Processed molecule 400/2048
Processed molecule 500/2048
Processed molecule 600/2048
Processed molecule 700/2048
Processed molecule 800/2048
Processed molecule 900/2048
Processed molecule 1000/2048
Processed molecule 1100/2048
Processed molecule 1200/2048
Processed molecule 1300/2048
Processed molecule 1400/2048
Processed molecule 1500/2048
Processed molecule 1600/2048
Processed molecule 1700/2048
Processed molecule 1800/2048
Processed molecule 1900/2048
Processed molecule 2000/2048
Processed molecule 2048/2048
