# Project: Decoding Molecules From Fingerprints.
## Group Members:
### Qi Chen, e-mail: gusqichr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
***

In [1]:
import zipfile
import os
import pandas as pd
import numpy as np
from rdkit.Chem import PandasTools
from rdkit.Chem import rdFingerprintGenerator
from rdkit import Chem
from joblib import Parallel, delayed

#### Convert SMILES to Fingerprints
***

In [2]:
import sys

# Project root: the parent of the current working directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Make modules located under the project root importable
sys.path.append(PROJECT_ROOT)

# Zip archive that holds the input CSV
ZIP_FILE_PATH = os.path.join(PROJECT_ROOT, 'data/final_chembl.zip')

# Path of the combined parquet file (not referenced by the cells shown here)
COMBINED_PARQUET_PATH = os.path.join(PROJECT_ROOT, 'data/combined_molecule_fingerprints.parquet')

# Directory receiving one parquet file per processed chunk; created if missing
SAVE_DIR = os.path.join(PROJECT_ROOT, 'data/chunks')
os.makedirs(SAVE_DIR, exist_ok=True)

# Value passed as n_jobs to joblib.Parallel during fingerprint generation
NUM_CORES = 16

In [3]:
def import_data_in_chunks(zip_file_path, chunksize=10000):
    """
    Import data from a zip file containing a CSV in chunks.

    The archive member to read is selected as follows: directory entries and
    macOS metadata entries (``__MACOSX/`` folders, ``._*`` files) are ignored;
    among the remaining members the first one whose name ends in ``.csv``
    (case-insensitive) is used, falling back to the first remaining member
    when none carries that extension.

    Args:
        zip_file_path: Path to the zip archive.
        chunksize: Number of rows per yielded DataFrame.

    Yields:
        pandas.DataFrame: Consecutive chunks of the selected CSV member.

    Raises:
        ValueError: If the archive contains no file member to read. Because
            this is a generator, the error surfaces on the first iteration.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as z:
        # Blindly taking namelist()[0] can pick up a folder entry, a README or
        # a macOS resource fork instead of the actual data file.
        candidates = [
            name for name in z.namelist()
            if not name.endswith('/')
            and not name.startswith('__MACOSX/')
            and not name.rsplit('/', 1)[-1].startswith('._')
        ]
        if not candidates:
            raise ValueError(f"No data file found in archive: {zip_file_path}")

        csv_filename = next(
            (name for name in candidates if name.lower().endswith('.csv')),
            candidates[0],
        )
        # The archive must stay open while chunks are consumed lazily.
        with z.open(csv_filename) as f:
            yield from pd.read_csv(f, chunksize=chunksize)

In [4]:
def prepare_smiles_df(df_chunk):
    """
    Build a working copy of a chunk with a molecule column attached.

    Only the 'ChEMBL ID' and 'smiles' columns of ``df_chunk`` are kept (as a
    copy, so the input chunk is left untouched), and RDKit's PandasTools adds
    a 'Molecule' column derived from the 'smiles' column.
    """
    wanted_columns = ['ChEMBL ID', 'smiles']
    frame = df_chunk[wanted_columns].copy()
    PandasTools.AddMoleculeColumnToFrame(frame, smilesCol='smiles', molCol='Molecule')
    return frame

In [5]:
def generate_fingerprint(mol):
    """
    Compute a 2048-bit Morgan fingerprint (radius 2) for one molecule.

    Returns a pair ``(bit_vector, sparse_str)`` where ``bit_vector`` is an
    integer numpy array built from the fingerprint's bit string and
    ``sparse_str`` is a space-separated string of the indices whose bit is 1.
    When ``mol`` is None, ``(None, None)`` is returned.
    """
    if mol is None:
        return None, None

    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    fingerprint = generator.GetFingerprint(mol)

    # Split the bit string into characters, then convert them to integers
    bit_chars = list(fingerprint.ToBitString())
    bit_vector = np.array(bit_chars).astype(int)

    # Positions of the set bits, rendered as "i j k ..."
    on_positions = np.flatnonzero(bit_vector == 1)
    sparse_representation_str = ' '.join(str(position) for position in on_positions)
    return bit_vector, sparse_representation_str

In [6]:
def generate_fingerprints_parallel(smiles_df):
    """
    Generate fingerprints for all molecules in parallel.

    ``generate_fingerprint`` is applied to every entry of the 'Molecule'
    column through joblib with ``n_jobs=NUM_CORES``. The first element of each
    result pair is stored in a new 'FingerprintBits' column and the second in
    a new 'SparseFingerprintBits' column.

    Args:
        smiles_df: DataFrame with a 'Molecule' column.

    Returns:
        The same DataFrame object, modified in place with the two new columns.
        A DataFrame without rows gets two empty columns.
    """
    results = Parallel(n_jobs=NUM_CORES)(
        delayed(generate_fingerprint)(mol) for mol in smiles_df['Molecule']
    )
    # Split the pairs with comprehensions instead of unpacking zip(*results):
    # with no rows, zip(*[]) yields nothing and the unpacking raises ValueError.
    smiles_df['FingerprintBits'] = [bits for bits, _ in results]
    smiles_df['SparseFingerprintBits'] = [sparse for _, sparse in results]
    return smiles_df

In [7]:
def process_chunk(df_chunk, chunk_id):
    """
    Run the fingerprint pipeline on one chunk and save the result.

    The chunk is prepared with ``prepare_smiles_df``, fingerprinted with
    ``generate_fingerprints_parallel``, stripped of its 'Molecule' column and
    written to ``SAVE_DIR`` as a snappy-compressed parquet file whose name
    carries ``chunk_id``. A confirmation line is printed afterwards.
    """
    with_fingerprints = generate_fingerprints_parallel(prepare_smiles_df(df_chunk))

    # The molecule objects are dropped before the frame is written out
    output_frame = with_fingerprints.drop(columns=['Molecule'])

    target_path = os.path.join(SAVE_DIR, f'molecule_fingerprints_part_{chunk_id}.parquet')
    output_frame.to_parquet(target_path, compression='snappy')
    print(f"Processed and saved chunk {chunk_id}")

In [8]:
# Rows read from the CSV per chunk
chunksize = 50000

# Stream the archive chunk by chunk; the chunk index is passed along and
# ends up in the name of the parquet file written for that chunk
for chunk_id, df_chunk in enumerate(
    import_data_in_chunks(ZIP_FILE_PATH, chunksize=chunksize), start=0
):
    process_chunk(df_chunk, chunk_id)

print("All chunks processed and saved as individual parquet files.")

Processed and saved chunk 0
Processed and saved chunk 1
Processed and saved chunk 2
Processed and saved chunk 3
Processed and saved chunk 4
Processed and saved chunk 5
Processed and saved chunk 6
Processed and saved chunk 7
Processed and saved chunk 8
Processed and saved chunk 9
Processed and saved chunk 10
Processed and saved chunk 11
Processed and saved chunk 12
Processed and saved chunk 13
Processed and saved chunk 14
Processed and saved chunk 15
Processed and saved chunk 16
Processed and saved chunk 17
Processed and saved chunk 18
Processed and saved chunk 19
Processed and saved chunk 20
Processed and saved chunk 21
Processed and saved chunk 22
Processed and saved chunk 23
Processed and saved chunk 24
Processed and saved chunk 25
Processed and saved chunk 26
Processed and saved chunk 27
All chunks processed and saved as individual parquet files.
