Processes a vector of smiles strings into xyz file

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_xyz(smiles_list):
    xyz_list = []
    
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES string: {smiles}")

        # Add hydrogens
        mol = Chem.AddHs(mol)

        # Generate 3D coordinates
        AllChem.EmbedMolecule(mol, AllChem.ETKDG())
        if mol.GetNumConformers() == 0:
            raise ValueError(f"Unable to generate 3D coordinates for SMILES: {smiles}")

        # Get XYZ string representation
        xyz_string = Chem.MolToXYZBlock(mol, confId=0)

        # Add " 0.0000" after the number of atoms line with a space before
        xyz_string = xyz_string.split('\n', 1)
        xyz_string = xyz_string[0] + "\n0.0000" + xyz_string[1]

        xyz_list.append(xyz_string)
    return xyz_list

def save_xyz_file(xyz_list, file_path):
    with open(file_path, 'w') as f:
        f.write(''.join(xyz_list))  # No separator between XYZ strings



In [12]:
import pandas as pd 

data_filepath = '../../data/embs/embslogs/smileslogs-prunedsmiles2xyzFAIL-.csv'
data = pd.read_csv(data_filepath,delimiter=',')
smiles = data.iloc[:1320,1]
# Example usage
smiles_list = smiles.tolist()

try:
    xyz_list = smiles_to_xyz(smiles_list)

    # Save all XYZ strings into one file without space between molecules
    save_xyz_file(xyz_list, "../../data/embs/embslogs/logSmols.xyz")
    print("XYZ strings saved to 'output.xyz' file.")
except ValueError as e:
    print(e)


XYZ strings saved to 'output.xyz' file.


Process melting point strings to array/series and save

In [None]:

data_filepath = '../data/datasets/embsMP/smiles-mps.csv'
data = pd.read_csv(data_filepath,delimiter=',')
mps = data.iloc[:1000,1]
mps = mps.tolist()

In [None]:
import re

def extract_first_number(s):
    # Use regular expression to find the first number in the string
    match = re.search(r'[-+]?\d*\.\d+|\d+', s)
    if match:
        return int(float(match.group()))
    else:
        return None

# Example list of strings
strings_list =  mps

# Extract the first number from each string and store in an array
numbers_array = [extract_first_number(s) for s in strings_list]

# Save the numbers array to a file
output_file = "../data/datasets/embsMP/mps.csv"
with open(output_file, 'w') as f:
    for num in numbers_array:
        f.write(str(num) + ',\n')

print("Numbers saved to 'numbers.txt' file.")


Process melting points to a vector of number targets, y