In [2]:
import numpy as np
import pandas as pd
import os
import random
import json
from typing import List
import re

from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [3]:
seed = 0
random.seed(seed)
np.random.seed(seed)

def load_json(file):
    with open(file, 'r') as f:
        return json.load(f)

def save_json(file, data):
    with open(file, 'w') as f:
        json.dump(data, f)

In [4]:
# Function to canonicalize SMILES
def canonicalize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return Chem.MolToSmiles(mol, canonical=True)
    except:
        pass  # Handle invalid SMILES strings

In [5]:
def extract_output(output_file, model_id) -> List[str]:
    """
    Extracts the list of SMILES strings from the json response output of LLMs
    :param output_file: str, path to the json file containing the output
    :param model_id: str, the model id of the LLM used to differentiate the json object structure
    """
    with open(output_file, "r") as file:
        data = json.load(file)

    # Updated SMILES regex pattern to be more robust
    smiles_pattern = r"<SMILES>\s*((?:[A-Za-z0-9@+\-\[\]\(\)\\\/%=#\$:.]+))\s*</SMILES>"
    optimized_smiles = []
    matches = None

    for i, entry in enumerate(data):
        if model_id in ['llasmol']:
            response_text = " ".join(entry.get("output", [""]))
        elif model_id in ['claude']:
            response_text = entry.get("output", "")
        elif model_id in ['mistral', 'chemllm', 'llama']:
            response_tag = "[/INST]\n%%% Response:"
            responses = entry.get("response", [])
            # by mistake we put the prompt as well in the response, so we have input smiles as well
            # so we need to extract the output smiles only from the response
            if response_tag in responses[0]:
                response_text = []
                for response in responses:
                    if response_tag in response:
                        response_text.append(response.split(response_tag)[1].split('<<SYS>>')[0])
                response_text = " ".join(response_text)
                #print(i, response_text)
            else:
                response_text = " ".join(responses)
        else:
            response_text = entry

        matches = re.findall(smiles_pattern, response_text)

        if matches:
            # Use the last match in case there are multiple SMILES strings
            opts = set()
            for match in matches:
                opts.add(match.strip())
            optimized_smiles.append(' '.join(list(opts)))
            #print(i, opts)
        else:
            optimized_smiles.append("None")

    return optimized_smiles

In [6]:
def process_output_llms(raw_output_path, output_path, model_id):
    """
    Process the output of LLMs to extract the SMILES strings
    :param raw_output_path: str, path to the raw output json file
    :param output_path: str, path to save the extracted SMILES strings
    :param model_id: str, the model id of the LLM used to differentiate the json format
        Accepted values: ['llasmol', 'claude', 'mistral', 'chemllm', 'llama']
    """
    # for llms, we need to extract the SMILES from json file and return all the SMILES
    optimized_smiles = extract_output(raw_output_path, model_id)

    output_smiles = []
    for i in range(len(optimized_smiles)):
        tmp_smiles = optimized_smiles[i].split()
        opt_smiles = set()
        for smile in tmp_smiles:
            # first check if the smile is valid
            canonical_smile = None
            try:
                canonical_smile = Chem.MolToSmiles(Chem.MolFromSmiles(smile))
                opt_smiles.add(canonical_smile)
            except:
                pass
        output_smiles.append(','.join(opt_smiles))
    
    # save the list of SMILES as a txt file
    np.savetxt(output_path, output_smiles, fmt='%s', comments='')

In [7]:
from admet_ai import ADMETModel
model = ADMETModel(num_workers=4)

  vars(torch.load(path, map_location=lambda storage, loc: storage)["args"]),
  state = torch.load(path, map_location=lambda storage, loc: storage)


Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda
Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained p

  state = torch.load(path, map_location=lambda storage, loc: storage)


Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Moving model to cuda
Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained p

In [8]:
def generate_props(smiles_path, props_path):
    """
    Generates properties for the preprocessed SMILES files using the ADMET model
    :param smiles_path: str, path to the preprocessed SMILES file
    :param output_path: str, path to save the properties as a csv file
    """

    # load the preprocessed SMILES
    smiles = []
    with open(smiles_path, 'r') as f:
        for line in f:
            smiles += line.strip().split(',')
    # remove duplicates
    smiles = set(smiles)
    print(f"Number of preprocessed SMILES: {len(smiles)}")
    smiles = [canonicalize_smiles(smile) for smile in smiles]
    #print(len(smiles))
    smiles = [smile for smile in smiles if smile is not None]
    print(f"Number of postprocessed SMILES: {len(smiles)}")
    # compute properties
    props = model.predict(smiles)
    # save the props as a csv
    props_df = pd.DataFrame(props)
    props_df.reset_index(inplace=True)
    # rename BBB_Martins to BBBP
    props_df.rename(columns={'index': 'smiles', 'BBB_Martins': 'bbbp', 'AMES': 'mutagenicity', 'HIA_Hou': 'hia'}, inplace=True)
    props_df = props_df[['smiles', 'bbbp', 'mutagenicity', 'hia']].round(2)
    props_df.to_csv(props_path, index=False)

In [None]:
# Use `model_id` = 'mistral' or 'llama' for all GeLLMO models using Mistral or LLama as the base LLM, respectively.
# Use `model_id` = 'llasmol' or 'claude' for outputs of LLaSMoL or Claude, respectively.
process_output_llms("test/bbbp+drd2+plogp+qed_response.json", "test/bbbp+drd2+plogp+qed-smiles.csv", "mistral")
generate_props("test/bbbp+drd2+plogp+qed-smiles.csv", "test/bbbp+drd2+plogp+qed-admet_props.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'test/bbbp+drd2+plogp+qed-smiles.csv'