# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [1]:
import os
import pandas as pd
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from rdkit import Chem

In [2]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA device count: 1
CUDA device name: NVIDIA GeForce RTX 4090


### Define directories
***

In [3]:
# Set user directory
USER_DIR = os.path.expanduser('~')

# Set the project root directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Define the input and output directories
INPUT_DIR = os.path.join(PROJECT_ROOT, 'data/chunks')
OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'data/processed_chunks')
FINAL_OUTPUT = os.path.join(PROJECT_ROOT, 'data/final_processed_data.parquet')

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Path to the conda environment to MolForge
CONDA_ENV_PATH = os.path.join(USER_DIR, 'conda/envs/molforge/bin/python')
MOLFORGE_SCRIPT_PATH = os.path.join(PROJECT_ROOT, 'MolForge/predict.py')

# Define the number of workers
WORKERS = 15

### Check parquet files
***

In [4]:
for file in os.listdir(INPUT_DIR):
    if file.endswith('.parquet'):
        df = pd.read_parquet(os.path.join(INPUT_DIR, file))
        print(f"{file} shape: {df.shape}")


molecule_fingerprints_part_6.parquet shape: (182680, 4)
molecule_fingerprints_part_1.parquet shape: (200000, 4)
molecule_fingerprints_part_4.parquet shape: (200000, 4)
molecule_fingerprints_part_3.parquet shape: (200000, 4)
molecule_fingerprints_part_2.parquet shape: (200000, 4)
molecule_fingerprints_part_0.parquet shape: (200000, 4)
molecule_fingerprints_part_5.parquet shape: (200000, 4)


#### Compute MolForge Predictions
***


In [5]:
def convert_spaced_smiles(spaced_smiles):
    compact_smiles = spaced_smiles.replace(" ", "")
    mol = Chem.MolFromSmiles(compact_smiles)
    if mol is not None:
        return Chem.MolToSmiles(mol, isomericSmiles=False)
    else:
        return "Invalid SMILES string"

In [6]:
def run_molforge(fp_input):
    # Get the full path to the Python interpreter you use in the terminal
    command = f"{CONDA_ENV_PATH} {MOLFORGE_SCRIPT_PATH} --fp='ECFP4' --model_type='smiles' --input='{fp_input}'"
    
    # Run the command and capture output, ensuring environment variables are inherited
    result = subprocess.run(command, shell=True, capture_output=True, text=True, cwd=os.path.dirname(MOLFORGE_SCRIPT_PATH), env=os.environ.copy(), timeout=300)
    
    # Process the result
    spaced_smiles = None
    device_used = None
    output_lines = result.stdout.splitlines()
    
    for line in output_lines:
        if "Result:" in line:
            spaced_smiles = line.split("Result:")[1].strip()
        if "rank :" in line:
            device_used = line.split("rank :")[1].strip()
            
    # Now convert the spaced SMILES to compact SMILES
    if spaced_smiles:
        compact_smiles = convert_spaced_smiles(spaced_smiles)
    else:
        compact_smiles = "No Result"

    return compact_smiles, device_used

In [7]:
'''
def run_molforge(fp_input):
    command = f"{CONDA_ENV_PATH} {MOLFORGE_SCRIPT_PATH} --fp='ECFP4' --model_type='smiles' --input='{fp_input}'"
    try:
        result = subprocess.run(command, shell=True, capture_output=True, text=True, cwd=os.path.dirname(MOLFORGE_SCRIPT_PATH), env=os.environ.copy(), timeout=300)
        print(f"Command output: {result.stdout}")
        print(f"Command error: {result.stderr}")
        if result.returncode != 0:
            print(f"Command failed with return code {result.returncode}")
            return "Error", "None"
    except subprocess.TimeoutExpired:
        print(f"Timeout occurred for input: {fp_input}")
        return "Timeout", "None"
    except Exception as e:
        print(f"Unexpected error: {e}")
        return "Error", "None"
    
    spaced_smiles = None
    device_used = None
    output_lines = result.stdout.splitlines()
    
    for line in output_lines:
        if "Result:" in line:
            spaced_smiles = line.split("Result:")[1].strip()
        if "rank :" in line:
            device_used = line.split("rank :")[1].strip()
    
    if spaced_smiles:
        compact_smiles = convert_spaced_smiles(spaced_smiles)
    else:
        compact_smiles = "No Result"

    return compact_smiles, device_used
'''

'\ndef run_molforge(fp_input):\n    command = f"{CONDA_ENV_PATH} {MOLFORGE_SCRIPT_PATH} --fp=\'ECFP4\' --model_type=\'smiles\' --input=\'{fp_input}\'"\n    try:\n        result = subprocess.run(command, shell=True, capture_output=True, text=True, cwd=os.path.dirname(MOLFORGE_SCRIPT_PATH), env=os.environ.copy(), timeout=300)\n        print(f"Command output: {result.stdout}")\n        print(f"Command error: {result.stderr}")\n        if result.returncode != 0:\n            print(f"Command failed with return code {result.returncode}")\n            return "Error", "None"\n    except subprocess.TimeoutExpired:\n        print(f"Timeout occurred for input: {fp_input}")\n        return "Timeout", "None"\n    except Exception as e:\n        print(f"Unexpected error: {e}")\n        return "Error", "None"\n    \n    spaced_smiles = None\n    device_used = None\n    output_lines = result.stdout.splitlines()\n    \n    for line in output_lines:\n        if "Result:" in line:\n            spaced_smi

In [8]:
def process_row(row):
    fp_input = row['SparseFingerprintBits']
    
    # Run MolForge and get the converted SMILES and device information
    compact_smiles, device_used = run_molforge(fp_input)
    
    return compact_smiles, device_used

In [9]:
def process_chunk(file_path):
    chunk_df = pd.read_parquet(file_path)
    total = len(chunk_df)
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        futures = {executor.submit(process_row, row): idx for idx, row in chunk_df.iterrows()}
        
        for i, future in enumerate(as_completed(futures)):
            idx = futures[future]
            try:
                result, device_used = future.result()
                chunk_df.at[idx, 'Converted_SMILES'] = result
                chunk_df.at[idx, 'Device_Used'] = device_used
                if (i + 1) % 100 == 0 or (i + 1) == total:
                    print(f"Processed molecule {i + 1}/{total} in {os.path.basename(file_path)}")
            except Exception as e:
                print(f"Error processing molecule {idx + 1} in {os.path.basename(file_path)}: {e}")
                chunk_df.at[idx, 'Converted_SMILES'] = 'Error'
                chunk_df.at[idx, 'Device_Used'] = 'Error'
    
    output_file = os.path.join(OUTPUT_DIR, f"processed_{os.path.basename(file_path)}")
    chunk_df.to_parquet(output_file, index=False)
    print(f"Finished processing {os.path.basename(file_path)}")

In [10]:
# Process all chunk files
chunk_files = [f for f in os.listdir(INPUT_DIR) if f.endswith('.parquet')]
total_chunks = len(chunk_files)
for i, file in enumerate(chunk_files, 1):
    file_path = os.path.join(INPUT_DIR, file)
    print(f"Processing chunk {i}/{total_chunks}: {file}")
    process_chunk(file_path)
    print(f"Completed chunk {i}/{total_chunks}")

Processing chunk 1/7: molecule_fingerprints_part_6.parquet


[08:26:32] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7


Processed molecule 100/182680 in molecule_fingerprints_part_6.parquet


[08:27:17] Can't kekulize mol.  Unkekulized atoms: 18 19 20 24 25 26 27


Processed molecule 200/182680 in molecule_fingerprints_part_6.parquet


[08:27:44] SMILES Parse Error: unclosed ring for input: 'CCOC(=O)C[C@@]1([C@@H]2CC[C@@H]3[C@H]4[C@@]5(CCC([C@H]([C@@H]5CC[C@@]4([C@@]2(CC[C@@H]1C(C)(C)C(=O)OCC)C)C)OC3)(C)C)CO4)C'


Processed molecule 300/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 400/182680 in molecule_fingerprints_part_6.parquet


[08:28:40] SMILES Parse Error: unclosed ring for input: 'C[C@H]1C[C@@H]([C@@H]2[C@@H](C3=C1C(=O)C=C3C)OC(=O)[C@]4([C@H](O4)C)C)OC(=O)[C@]5([C@@H](O5)C)C'
[08:28:51] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 8 9 10


Processed molecule 500/182680 in molecule_fingerprints_part_6.parquet


[08:29:15] Can't kekulize mol.  Unkekulized atoms: 28 29 30 31 32


Processed molecule 600/182680 in molecule_fingerprints_part_6.parquet


[08:30:06] SMILES Parse Error: unclosed ring for input: 'C[C@H]1C(=O)C2=C([C@@]1(C)CO)[C@@H]3CC[C@H]4[C@@](CCC[C@@]4(C3)O)(C)CO'


Processed molecule 700/182680 in molecule_fingerprints_part_6.parquet


[08:30:58] SMILES Parse Error: unclosed ring for input: 'C[C@@H](NC(=O)Nc1cc2[nH]nc3N(CCc2n1)C(C)C)c1ccccc1'


Processed molecule 800/182680 in molecule_fingerprints_part_6.parquet


[08:31:30] SMILES Parse Error: unclosed ring for input: 'C[C@@]12CC[C@@H]3[C@@](C1=O)(CC[C@@H]4[C@@]3(C=CC(=O)OC4(C)C)C)C[C@]56C=C7C(=O)OC([C@@H]8[C@@]7(C5)C=CC(=O)OC8(C)C)(C)C'


Processed molecule 900/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 1000/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 1100/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 1200/182680 in molecule_fingerprints_part_6.parquet


[08:33:23] SMILES Parse Error: unclosed ring for input: 'Cc1ccncc1C(=O)N1C[C@@H]2C[C@H](CN3CC4CCC4)[C@@]2(C1)C(=O)O'


Processed molecule 1300/182680 in molecule_fingerprints_part_6.parquet


[08:34:30] SMILES Parse Error: unclosed ring for input: 'C[C@H](CC/C=C(\C)/C(=O)O)[C@H]1CC[C@@H]2C1=C[C@H]3C(=C)[C@@H]4CCCC4(CCC(=O)OC3(C)C)C'


Processed molecule 1400/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 1500/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 1600/182680 in molecule_fingerprints_part_6.parquet


[08:35:51] SMILES Parse Error: unclosed ring for input: 'NC(=O)c1c(N)n(nc1c1ccc(Oc2ccc(Cl)cn2)cc1)C1CCC(CC1)(CC2)C#N'


Processed molecule 1700/182680 in molecule_fingerprints_part_6.parquet


[08:36:24] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[08:36:55] Can't kekulize mol.  Unkekulized atoms: 2 3 4 7 8 12 13


Processed molecule 1800/182680 in molecule_fingerprints_part_6.parquet


[08:37:19] SMILES Parse Error: unclosed ring for input: 'CN1C=C(C=N1)C2=NC3=C(C=NN3C)C4CC(C4)(CC#N)CC#N'


Processed molecule 1900/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 2000/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 2100/182680 in molecule_fingerprints_part_6.parquet


[08:39:04] Can't kekulize mol.  Unkekulized atoms: 0 1 2 5 7 8 9 10 21 22


Processed molecule 2200/182680 in molecule_fingerprints_part_6.parquet


[08:39:55] Can't kekulize mol.  Unkekulized atoms: 25 26 29


Processed molecule 2300/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 2400/182680 in molecule_fingerprints_part_6.parquet


[08:41:06] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 9 20 21 22 23 24


Processed molecule 2500/182680 in molecule_fingerprints_part_6.parquet


[08:41:14] SMILES Parse Error: unclosed ring for input: 'CC1=CC=C(C=C1)N2C(=C3C(=N2)C=C4C(=N2)C=C(OC4=O)C)C'


Processed molecule 2600/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 2700/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 2800/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 2900/182680 in molecule_fingerprints_part_6.parquet
Processed molecule 3000/182680 in molecule_fingerprints_part_6.parquet


In [None]:
# Combine all processed files
'''
processed_files = [f for f in os.listdir(OUTPUT_DIR) if f.startswith('processed_')]
combined_df = pd.concat([pd.read_parquet(os.path.join(OUTPUT_DIR, f)) for f in processed_files], ignore_index=True)
'''

In [None]:
# Save the final combined file
#combined_df.to_parquet(FINAL_OUTPUT, index=False)