# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [115]:
import os
import pandas as pd
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from rdkit import Chem

### Define directories
***

In [116]:
# Number of worker threads used when a chunk is processed
WORKERS = 16

# Home directory of the current user; the MolForge checkout and its conda
# environment are looked up underneath it
USER_DIR = os.path.expanduser('~')

# The project root is taken to be the parent of the current working directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Input chunks, per-chunk results and the final combined result file
INPUT_DIR = os.path.join(PROJECT_ROOT, 'data/chunks')
OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'data/processed_chunks')
FINAL_OUTPUT = os.path.join(PROJECT_ROOT, 'data/final_processed_data.parquet')

# MolForge locations. Note that CONDA_ENV_PATH points at the Python
# interpreter inside the conda environment, not at the environment directory.
CONDA_ENV_PATH = os.path.join(USER_DIR, 'conda/envs/molforge/bin/python')
MOLFORGE_DIR = os.path.join(USER_DIR, 'MolForge')
MOLFORGE_SCRIPT_PATH = os.path.join(USER_DIR, 'MolForge/predict.py')

# Create the directory for per-chunk results if it is not there yet
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [117]:
# Echo every configured location, one per line, so they can be checked by eye
print(
    PROJECT_ROOT,
    INPUT_DIR,
    OUTPUT_DIR,
    FINAL_OUTPUT,
    CONDA_ENV_PATH,
    MOLFORGE_DIR,
    MOLFORGE_SCRIPT_PATH,
    sep='\n',
)


/home/javhittaxx/Fingerprints-To-Smiles
/home/javhittaxx/Fingerprints-To-Smiles/data/chunks
/home/javhittaxx/Fingerprints-To-Smiles/data/processed_chunks
/home/javhittaxx/Fingerprints-To-Smiles/data/final_processed_data.parquet
/home/javhittaxx/conda/envs/molforge/bin/python
/home/javhittaxx/MolForge
/home/javhittaxx/MolForge/predict.py


### Check parquet files
***

In [118]:
"""
for file in os.listdir(INPUT_DIR):
    if file.endswith('.parquet'):
        df = pd.read_parquet(os.path.join(INPUT_DIR, file))
        print(f"{file} shape: {df.shape}")
"""


'\nfor file in os.listdir(INPUT_DIR):\n    if file.endswith(\'.parquet\'):\n        df = pd.read_parquet(os.path.join(INPUT_DIR, file))\n        print(f"{file} shape: {df.shape}")\n'

In [119]:
# Chunk files selected for this run; the names are resolved against INPUT_DIR
# when they are passed to process_files.
parquet_files = ['molecule_fingerprints_part_0.parquet']

#### Compute MolForge Predictions
***


In [120]:
def convert_spaced_smiles(spaced_smiles):
    """Turn a space-separated SMILES string into a compact SMILES string.

    Every space is removed and the remainder is parsed with RDKit. A string
    that parses is written back out by RDKit with ``isomericSmiles=False``;
    one that does not parse yields the literal placeholder
    ``"Invalid SMILES string"``.
    """
    molecule = Chem.MolFromSmiles("".join(spaced_smiles.split(" ")))
    if molecule is None:
        return "Invalid SMILES string"
    return Chem.MolToSmiles(molecule, isomericSmiles=False)

In [121]:
def run_molforge(fp_input):
    """Run the MolForge predict script on one fingerprint and parse its output.

    The script is started with the interpreter at ``CONDA_ENV_PATH`` from
    inside ``MOLFORGE_DIR``, using ECFP4 fingerprints and the SMILES model.

    Returns:
        A ``(smiles, device)`` pair:

        * ``("Error", "Error")`` when the process exits with a non-zero
          status (the error and its stderr are printed);
        * ``smiles`` is ``"No Result"`` when stdout has no non-empty
          ``Result:`` entry, otherwise the text after ``Result:`` passed
          through ``convert_spaced_smiles``;
        * ``device`` is the text after ``rank :`` in stdout, or ``None`` when
          no such line was printed.

        When a marker appears on several lines, the last one wins.
    """
    try:
        completed = subprocess.run(
            [
                CONDA_ENV_PATH,
                MOLFORGE_SCRIPT_PATH,
                "--fp=ECFP4",
                "--model_type=smiles",
                f"--input={fp_input}",
            ],
            cwd=MOLFORGE_DIR,
            env=os.environ.copy(),
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running MolForge: {e}")
        print(f"Error output: {e.stderr}")
        return "Error", "Error"

    # Scan stdout for the prediction and for the device the script reported
    raw_smiles = None
    device_used = None
    for line in completed.stdout.splitlines():
        if "Result:" in line:
            raw_smiles = line.split("Result:")[1].strip()
        if "rank :" in line:
            device_used = line.split("rank :")[1].strip()

    # A missing or empty prediction is reported with a placeholder
    if not raw_smiles:
        return "No Result", device_used
    return convert_spaced_smiles(raw_smiles), device_used

In [122]:
def process_row(row):
    """Run MolForge on the ``SparseFingerprintBits`` entry of *row*.

    Returns the ``(compact_smiles, device_used)`` pair produced by
    ``run_molforge`` for that fingerprint.
    """
    return run_molforge(row['SparseFingerprintBits'])

In [123]:
def process_chunk(file_path, max_rows=None):
    """Run MolForge on every fingerprint of one parquet chunk and save the result.

    The chunk is read from *file_path*, each ``SparseFingerprintBits`` value is
    handed to ``run_molforge`` on a pool of ``WORKERS`` threads, and the frame
    is written to ``OUTPUT_DIR`` as ``processed_<original file name>`` with two
    additional columns, ``Converted_SMILES`` and ``Device_Used``. Progress is
    printed every 100 molecules and once more when the chunk is complete.

    Args:
        file_path: Path of the parquet chunk to process.
        max_rows: When truthy, only the first ``max_rows`` rows are processed;
            ``None`` (or any other falsy value) processes the whole chunk.

    Raises:
        Any exception raised inside ``run_molforge`` is re-raised here by
        ``future.result()``; in that case no output file is written.
    """
    chunk_df = pd.read_parquet(file_path)
    if max_rows:
        # Work on an independent copy so the new columns are not added to a
        # slice of the frame that was just read.
        chunk_df = chunk_df.head(max_rows).copy()

    chunk_name = os.path.basename(file_path)
    total = len(chunk_df)

    # Results are collected by row position rather than by index label, so
    # they land on the right row even if the chunk's index is not unique, and
    # both columns exist in the output even when the chunk has no rows.
    smiles_by_pos = [None] * total
    devices_by_pos = [None] * total

    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        # Only the fingerprint column is needed, so iterate it directly
        futures = {
            executor.submit(run_molforge, fp_input): pos
            for pos, fp_input in enumerate(chunk_df['SparseFingerprintBits'])
        }

        for done, future in enumerate(as_completed(futures), start=1):
            pos = futures[future]
            smiles_by_pos[pos], devices_by_pos[pos] = future.result()
            if done % 100 == 0 or done == total:
                print(f"Processed molecule {done}/{total} in {chunk_name}")

    chunk_df['Converted_SMILES'] = smiles_by_pos
    chunk_df['Device_Used'] = devices_by_pos

    output_file = os.path.join(OUTPUT_DIR, f"processed_{chunk_name}")
    chunk_df.to_parquet(output_file, index=False)
    print(f"Finished processing {chunk_name}")

In [124]:
def process_files(file_list=None, max_rows=None):
    """Process parquet chunks from ``INPUT_DIR`` one after another.

    Args:
        file_list: Optional list of file names relative to ``INPUT_DIR``.
            Only names that exist in ``INPUT_DIR`` and end in ``.parquet`` are
            processed, in the order given; any other name is reported and
            skipped. When ``None`` or empty, every ``.parquet`` file found in
            ``INPUT_DIR`` is processed in sorted name order.
        max_rows: Passed through to ``process_chunk`` for every file.
    """
    # List the directory once instead of once per requested file
    available = set(os.listdir(INPUT_DIR))

    if file_list:
        chunk_files = []
        for name in file_list:
            if name in available and name.endswith('.parquet'):
                chunk_files.append(name)
            else:
                # Make a mistyped or missing file name visible to the user
                print(f"Skipping {name}: not a .parquet file in {INPUT_DIR}")
    else:
        # os.listdir returns names in arbitrary order; sort for repeatable runs
        chunk_files = sorted(
            name for name in available if name.endswith('.parquet')
        )

    for name in chunk_files:
        process_chunk(os.path.join(INPUT_DIR, name), max_rows)


In [125]:
# Process only the selected chunk file(s), limited to the first 1000 rows of each
process_files(file_list=parquet_files, max_rows=1000)

[21:22:41] SMILES Parse Error: unclosed ring for input: 'CN1C(CC2=C3C14CCN(C(C3(CC2)O)CC(=O)NC5=CC=CC=C5)CC6CC6)CC(=O)NC7=CC=CC=C7'


Processed molecule 100/1000 in molecule_fingerprints_part_0.parquet
Processed molecule 200/1000 in molecule_fingerprints_part_0.parquet


[21:23:31] SMILES Parse Error: unclosed ring for input: 'C1CCC2=NC3=C(C=C(C=C3)NC(=O)CCCCCCCCCCCCCCCCCCCCCCCCCC2)C(=C2C1)N'


Processed molecule 300/1000 in molecule_fingerprints_part_0.parquet
Processed molecule 400/1000 in molecule_fingerprints_part_0.parquet
Processed molecule 500/1000 in molecule_fingerprints_part_0.parquet
Processed molecule 600/1000 in molecule_fingerprints_part_0.parquet
Processed molecule 700/1000 in molecule_fingerprints_part_0.parquet


[21:25:32] SMILES Parse Error: unclosed ring for input: 'C1C2CC3CC1CC(C2)(C3)C4=CC=CC=C4C5C(C6=CC=CC=C6C57CC8CC(C7)CC(C8)C7)N'


Processed molecule 800/1000 in molecule_fingerprints_part_0.parquet
Processed molecule 900/1000 in molecule_fingerprints_part_0.parquet


[21:26:13] SMILES Parse Error: unclosed ring for input: 'CN1CCC2=CC3=C(C4=C2C1CC5=C4C6=C(C=C5)CC7C8=C(C=CC9=C8OCO9)CCN7C)OCO3'
[21:26:14] SMILES Parse Error: unclosed ring for input: 'C123C4(C5(C(O2)(CC(C1(C(/C(=N/OC)/C3(C(C(C4=O)(C(C)C)O)O)C)OC(=O)c1[nH]ccc1)O)(C4(O)C)C)O)C)O'


Processed molecule 1000/1000 in molecule_fingerprints_part_0.parquet
Finished processing molecule_fingerprints_part_0.parquet
