In [None]:
! pip install requests pandas

# **Prepare Your List of PubChem CIDs**
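Store your PubChem CIDs in a plain-text file named **CIDs.txt**, one CID per line (a single-column CSV file without a header row also works, because the loader reads one CID per line and skips blank lines):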

12345

67890

... (300 CIDs)

In [None]:
import requests
import pandas as pd
from time import sleep

def fetch_smiles_from_pubchem(cid):
    """Fetch the SMILES string for one PubChem compound via the PUG REST API.

    Args:
        cid: PubChem compound identifier (int or numeric string). It is
            interpolated as-is into the request URL.

    Returns:
        The SMILES string, or None when the request fails, the server answers
        with a non-200 status, or the response holds no SMILES value. Every
        failure is printed and never raised, so a batch run keeps going.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES/JSON"
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Error fetching CID {cid}: HTTP {response.status_code}")
            return None

        record = response.json()['PropertyTable']['Properties'][0]
        # NOTE(review): PubChem appears to have renamed its SMILES properties,
        # so the value requested as "CanonicalSMILES" may come back under the
        # key "ConnectivitySMILES" -- confirm against the current PUG REST docs.
        # Accepting both keys keeps the lookup working either way.
        for key in ('CanonicalSMILES', 'ConnectivitySMILES'):
            smiles = record.get(key)
            if smiles:
                return smiles

        print(f"Failed for CID {cid}: no SMILES property in response")
        return None
    except Exception as e:
        # Deliberately broad: one bad CID (network error, malformed JSON,
        # unexpected payload shape) must not abort the whole download loop.
        print(f"Failed for CID {cid}: {str(e)}")
        return None

def main(input_path="CIDs.txt", output_path="pubchem_smiles.csv", delay=0.2):
    """Download a SMILES string for every CID in a text file and save them as CSV.

    Args:
        input_path: Text file with one PubChem CID per line; blank lines are
            skipped.
        output_path: CSV file to write, with the columns "CID" and "SMILES".
            CIDs whose lookup failed keep their row, with an empty SMILES field.
        delay: Seconds to wait after each request, to avoid overwhelming
            PubChem's server.
    """
    # Read CIDs from a file (one CID per line)
    with open(input_path, "r") as f:
        cids = [line.strip() for line in f if line.strip()]

    # Fetch SMILES for each CID
    results = []
    for cid in cids:
        smiles = fetch_smiles_from_pubchem(cid)
        results.append({"CID": cid, "SMILES": smiles})
        if delay > 0:
            sleep(delay)  # Avoid overwhelming PubChem's server

    # Save to CSV. The explicit columns keep the header row even when no CIDs
    # were read, so the file can always be loaded again with pd.read_csv.
    df = pd.DataFrame(results, columns=["CID", "SMILES"])
    df.to_csv(output_path, index=False)

    # Report only the lookups that actually returned a SMILES string.
    missing = [row["CID"] for row in results if row["SMILES"] is None]
    print(f"Saved {len(results) - len(missing)} SMILES to '{output_path}'")
    if missing:
        print(f"Warning: no SMILES retrieved for {len(missing)} CID(s): {', '.join(missing)}")

if __name__ == "__main__":
    main()

Saved 3 SMILES to 'pubchem_smiles.csv'


In [2]:
! pip install RDKit openmm

Collecting RDKit
  Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting openmm
  Downloading OpenMM-8.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (844 bytes)
Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl (35.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.2/35.2 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading OpenMM-8.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: RDKit, openmm
Successfully installed RDKit-2025.3.2 openmm-8.2.0


In [8]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# Load the CID/SMILES table written by the PubChem download step
df = pd.read_csv('pubchem_smiles.csv')

# CIDs whose download failed are saved with an empty SMILES field, which pandas
# reads back as NaN. RDKit only accepts strings, so drop those rows here and
# renumber the index so that row i of df still corresponds to smiles_list[i].
missing_smiles = df['SMILES'].isna()
if missing_smiles.any():
    print(f"Dropping {int(missing_smiles.sum())} row(s) without a SMILES string "
          f"(CIDs: {df.loc[missing_smiles, 'CID'].tolist()})")
df = df.loc[~missing_smiles].reset_index(drop=True)

# To exclude further entries by hand, drop them by row label, e.g.:
# df = df.drop([10, 13, 28])  # removes the rows labelled 10, 13 and 28
smiles_list = df['SMILES'].tolist()

df


Unnamed: 0,CID,SMILES
0,65576,CC1CCC2(C(C3C(O2)CC4C3(CCC5C4CCC6C5(CCC(C6)O)C...
1,10621,CC1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3=CC(=C4C(=O)CC...
2,135398735,CC1C=CC=C(C(=O)NC2=C(C(=C3C(=C2O)C(=C(C4=C3C(=...


In [9]:
# Validation pass: parse every SMILES once and report the ones RDKit rejects.
mols = []
for smiles in smiles_list:
    # A failed download is stored as an empty CSV field, which pandas loads as
    # NaN (a float). MolFromSmiles only accepts strings, so treat any non-string
    # entry as unparsable instead of letting it raise and abort the cell.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is not None:
        mols.append(mol)
    else:
        print(f'Could not create molecule from SMILES string: {smiles}')

In [10]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Positions in smiles_list whose conversion to a PDB file failed
failed_indices = []

# Convert the SMILES strings to RDKit molecules. Non-string entries (e.g. NaN
# from a failed download) become None here, so a single bad row is reported by
# the loop below instead of raising and aborting the whole cell.
mols = [
    Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    for smiles in smiles_list
]

# Iterate through the molecules; mols keeps one entry per SMILES (None for
# failures) so that index i always matches smiles_list[i] and input_{i}.pdb.
for i, mol in enumerate(mols):
    try:
        if mol is None:
            raise ValueError("SMILES is missing or could not be parsed")

        # Add explicit hydrogen atoms to the molecule
        mol = Chem.AddHs(mol)

        # Generate 3D coordinates. EmbedMolecule returns -1 when it cannot
        # produce a conformer; the fixed seed makes the coordinates
        # reproducible between runs.
        if AllChem.EmbedMolecule(mol, randomSeed=42) < 0:
            raise ValueError("3D embedding failed")

        # Minimize the structure with the UFF force field. A non-zero status
        # means the minimization did not report convergence; the geometry is
        # still written, but the user is told about it.
        status = AllChem.UFFOptimizeMolecule(mol)
        if status != 0:
            print(f"Warning: UFF minimization of structure {i} did not converge "
                  f"(status {status}); writing the unconverged geometry")

        # Save the minimized structure as a PDB file
        Chem.MolToPDBFile(mol, f'input_{i}.pdb')
    except Exception as e:
        # Handle the exception by reporting the index and moving to the next conversion
        print(f"Failed to convert structure at index {i}: {str(e)}")
        failed_indices.append(i)

# At this point, the "failed_indices" list contains the indices of failed conversions
# You can use this list for further analysis or to identify problematic SMILES strings.


In [11]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Positions in smiles_list whose conversion to a PDB file failed
failed_indices = []

# Convert the SMILES strings to RDKit molecules. Non-string entries (e.g. NaN
# from a failed download) become None here, so a single bad row is reported by
# the loop below instead of raising and aborting the whole cell.
mols = [
    Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    for smiles in smiles_list
]

# Iterate through the molecules; mols keeps one entry per SMILES (None for
# failures) so that index i always matches smiles_list[i] and input_{i}.pdb.
for i, mol in enumerate(mols):
    try:
        if mol is None:
            raise ValueError("SMILES is missing or could not be parsed")

        # Add explicit hydrogen atoms to the molecule
        mol = Chem.AddHs(mol)

        # Generate 3D coordinates. EmbedMolecule returns -1 when it cannot
        # produce a conformer; the fixed seed makes the coordinates
        # reproducible between runs.
        if AllChem.EmbedMolecule(mol, randomSeed=42) < 0:
            raise ValueError("3D embedding failed")

        # Minimize the structure with the UFF force field. A non-zero status
        # means the minimization did not report convergence; the geometry is
        # still written, but the user is told about it.
        status = AllChem.UFFOptimizeMolecule(mol)
        if status != 0:
            print(f"Warning: UFF minimization of structure {i} did not converge "
                  f"(status {status}); writing the unconverged geometry")

        # Save the minimized structure as a PDB file
        output_filename = f'input_{i}.pdb'
        Chem.MolToPDBFile(mol, output_filename)
    except Exception as e:
        # Handle the exception by reporting the index
        print(f"Failed to convert structure at index {i}: {str(e)}")
        failed_indices.append(i)

# At this point, the "failed_indices" list contains the indices of failed conversions
# You can use this list for further analysis or to identify problematic SMILES strings.


In [12]:
import os
import zipfile

# Zip the minimized PDB files that belong to the current `mols` list.
# Indices recorded in `failed_indices` are skipped even if an input_{i}.pdb
# exists on disk: such a file can only be a leftover from an earlier run and
# does not describe the current structure.
failed_now = set(failed_indices)

# PDB files are plain text, so store them deflated rather than uncompressed.
with zipfile.ZipFile('minimized_structures.zip', 'w',
                     compression=zipfile.ZIP_DEFLATED) as myzip:
    for i in range(len(mols)):
        pdb_file = f'input_{i}.pdb'
        if i in failed_now:
            print(f"Warning: structure {i} failed conversion, skipping {pdb_file}.")
        elif os.path.exists(pdb_file):
            myzip.write(pdb_file)
        else:
            print(f"Warning: File {pdb_file} not found, skipping.")
