In [2]:
# Location of the PDB structure file that the following cells parse
pdb_path = r'D:\PythonProj\Auto-EC\pdb_files\1FT5.pdb'

In [1]:
from Bio.PDB import PDBParser
import numpy as np
from collections import defaultdict
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from IPython.display import display
import py3Dmol
from rdkit.Chem import AllChem

# Define molecules to exclude (water, ions, and other small molecules)
exclude_list = ['HOH', 'WAT', 'H2O', 'O2', 'CL', 'NA', 'MG', 'CA', 'K', 'ZN', 'FE', 'CU', 'MN', 'CO', 'NI', 'CD', 'SO4', 'PO4']

# Parse the PDB file
parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', pdb_path)

# Map residue name -> list of records, one record per ligand instance
ligands = defaultdict(list)

# Collect every hetero/water residue whose name is not on the exclude list
for model in structure:
    for chain in model:
        for residue in chain:
            hetero_flag = residue.id[0]
            if not (hetero_flag.startswith('H_') or hetero_flag == 'W'):
                continue

            # Strip padding so a name such as ' CL' still matches the exclude list
            resname = residue.get_resname().strip()
            if resname in exclude_list:
                continue

            ligands[resname].append({
                'residue_id': residue.id,
                'chain_id': chain.id,
                'atom_count': len(list(residue.get_atoms())),
                'residue': residue,
            })

# Sort the instances of each ligand by atom count, largest first
instances_by_size = {
    resname: sorted(instances, key=lambda inst: inst['atom_count'], reverse=True)
    for resname, instances in ligands.items()
}

# Order the ligand types themselves by their largest instance, so that the
# first key of sorted_ligands really is the biggest ligand. Python's sort is
# stable, so types with equal size keep the order in which they were found.
sorted_ligands = dict(
    sorted(
        instances_by_size.items(),
        key=lambda item: item[1][0]['atom_count'],
        reverse=True,
    )
)

# Display results
print(f"Found {len(sorted_ligands)} unique ligand types")
for resname, instances in sorted_ligands.items():
    total_atoms = sum(inst['atom_count'] for inst in instances)
    print(f"Ligand {resname}: {len(instances)} instances, total {total_atoms} atoms")
    for i, inst in enumerate(instances):
        print(f"  Instance {i+1}: Chain {inst['chain_id']}, ID {inst['residue_id']}, {inst['atom_count']} atoms")
        

NameError: name 'pdb_path' is not defined

In [None]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem.AtomPairs import Pairs
# Function to convert a selected ligand to an RDKit molecule
def get_ligand_as_rdkit_mol(sorted_ligands, ligand_name, instance_idx=0, bond_cutoff=2.0):
    """
    Convert a ligand instance to an RDKit molecule

    Atoms are copied from the ligand residue, bonds are guessed purely from
    inter-atomic distances (every bond is added as SINGLE), and the original
    coordinates are attached as a conformer.

    Parameters:
    -----------
    sorted_ligands : dict
        Dictionary mapping residue name to a list of ligand records; each
        record must provide the ligand residue under the 'residue' key
    ligand_name : str
        The residue name of the ligand
    instance_idx : int
        Index of the ligand instance (default: 0 for the first instance).
        Negative indices count from the end, as for a Python list.
    bond_cutoff : float
        Two atoms closer than this distance are bonded (default: 2.0, in the
        coordinate units of the structure)

    Returns:
    --------
    RDKit Mol object, or None when the ligand name or instance index does
    not exist
    """
    if ligand_name not in sorted_ligands:
        print(f"Ligand {ligand_name} not found")
        return None

    instances = sorted_ligands[ligand_name]
    n_instances = len(instances)
    # Reject out-of-range indices in both directions instead of letting a
    # large negative index escape as an IndexError
    if not -n_instances <= instance_idx < n_instances:
        print(f"Ligand {ligand_name} has only {n_instances} instances, requested index {instance_idx}")
        return None

    ligand_residue = instances[instance_idx]['residue']
    atoms = list(ligand_residue.get_atoms())

    # Create an empty editable molecule and add one RDKit atom per PDB atom
    mol = Chem.RWMol()
    rd_indices = []
    for atom in atoms:
        # Keep letters only and normalise the case ('FE' -> 'Fe')
        symbol = ''.join(c for c in atom.element if c.isalpha()).capitalize()
        rd_indices.append(mol.AddAtom(Chem.Atom(symbol)))

    # Guess bonds from distances: each unordered atom pair is checked once
    coords = np.array([atom.get_coord() for atom in atoms], dtype=float)
    for i in range(len(atoms) - 1):
        distances = np.linalg.norm(coords[i + 1:] - coords[i], axis=1)
        for offset in np.nonzero(distances < bond_cutoff)[0]:
            j = i + 1 + int(offset)
            mol.AddBond(rd_indices[i], rd_indices[j], Chem.BondType.SINGLE)

    # Convert to an immutable mol
    mol = mol.GetMol()

    # Sanitization validates the guessed connectivity; it does not infer bond
    # orders, so every bond stays SINGLE whether or not this succeeds
    try:
        Chem.SanitizeMol(mol)
    except Exception as exc:
        print(f"Warning: Molecule sanitization failed ({exc}). Bond orders may be incorrect.")

    # Attach the original 3D coordinates as a conformer
    conf = Chem.Conformer(mol.GetNumAtoms())
    for rd_idx, xyz in zip(rd_indices, coords):
        point = Chem.rdGeometry.Point3D(float(xyz[0]), float(xyz[1]), float(xyz[2]))
        conf.SetAtomPosition(rd_idx, point)
    mol.AddConformer(conf)

    return mol

def CalculateFingerPrint(mol, verbose=False):
    """
    Compute simple per-atom descriptors for a molecule

    Hydrogens are added to a copy of the molecule, Gasteiger charges are
    computed on that copy, and one property record is produced per atom of
    the H-added molecule.

    Parameters:
    -----------
    mol : RDKit Mol object
        The molecule to describe
    verbose : bool
        When True, print a summary and display a 2D drawing first

    Returns:
    --------
    dict
        Maps atom index (in the H-added molecule) to a dict with the keys
        'atomic_num', 'is_aromatic', 'combined_charge', 'h_bond_potential'
        and 'metal_coordination'
    """
    if verbose:
        print(f"Successfully created RDKit molecule with {mol.GetNumAtoms()} atoms and {mol.GetNumBonds()} bonds")
        try:
            # 2D representation
            print("2D Structure:")
            display(Draw.MolToImage(mol))
        except ImportError:
            print("Visualization packages not available")

        print("Computing molecular properties and fingerprints...")

    # Add explicit hydrogens for better property calculation
    mol_h = Chem.AddHs(mol)
    Chem.SanitizeMol(mol_h)
    try:
        AllChem.ComputeGasteigerCharges(mol_h)
    except Exception as exc:
        print(f"Could not calculate Gasteiger charges: {exc}")

    # Simple lookup tables for donor atoms and metal ions (built once, not per atom)
    donor_strength = {'N': 3, 'O': 4, 'S': 2, 'P': 1}
    metal_ion_strength = {'Fe': -5, 'Zn': -4, 'Cu': -4, 'Mn': -4, 'Co': -4,
                          'Ni': -4, 'Mg': -3, 'Ca': -3, 'K': -1, 'Na': -1, 'Cd': -3}
    h_bond_elements = ('N', 'O', 'F', 'S', 'P')

    properties = {}
    for atom in mol_h.GetAtoms():
        atom_idx = atom.GetIdx()
        atom_symbol = atom.GetSymbol()

        # The charge property is read as text, so convert it before doing any
        # arithmetic; missing, unparsable, NaN or infinite values count as 0.0
        gasteiger_charge = 0.0
        if atom.HasProp('_GasteigerCharge'):
            try:
                gasteiger_charge = float(atom.GetProp('_GasteigerCharge'))
            except ValueError:
                gasteiger_charge = 0.0
            if not np.isfinite(gasteiger_charge):
                gasteiger_charge = 0.0

        # H-bond potential: 1 for a potential acceptor plus one per attached H.
        # Hydrogens are explicit atoms in mol_h, so neighbours must be counted.
        h_bond_potential = 0
        if atom_symbol in h_bond_elements:
            h_bond_potential = 1 + atom.GetTotalNumHs(includeNeighbors=True)

        # Metal coordination: positive for donor atoms, negative for metal ions
        if atom_symbol in donor_strength:
            coord_value = donor_strength[atom_symbol]
        else:
            coord_value = metal_ion_strength.get(atom_symbol, 0)

        properties[atom_idx] = {
            'atomic_num': atom.GetAtomicNum(),
            'is_aromatic': 1 if atom.GetIsAromatic() else 0,
            'combined_charge': atom.GetFormalCharge() + gasteiger_charge,
            'h_bond_potential': h_bond_potential,
            'metal_coordination': coord_value,
        }
    return properties

  
 # Get the first ligand name
# Fail early with a clear message when the structure contained no ligands
if not sorted_ligands:
    raise ValueError("No ligands were found in the structure; nothing to convert")

# The first key is the first ligand type stored in sorted_ligands
first_ligand_name = next(iter(sorted_ligands))
print(f"Converting ligand {first_ligand_name} to RDKit molecule...")
mol = get_ligand_as_rdkit_mol(sorted_ligands, first_ligand_name, 0)

# The converter signals failure by returning None; stop here rather than
# passing None on to the property calculation
if mol is None:
    raise ValueError(f"Could not convert ligand {first_ligand_name} to an RDKit molecule")

# Calculate properties and fingerprints
properties = CalculateFingerPrint(mol, verbose=False)

def create_3d_density_grid(mol, properties, grid_size=(64, 64, 64), sigma=1.0):
    """
    Create a 3D density grid for the molecule based on atomic properties.

    Each atom contributes a Gaussian halo to one of three channels:
    channel 0 for carbon/hydrogen, channel 1 for atoms with a positive
    'h_bond_potential', channel 2 for atoms with a non-zero
    'metal_coordination'. Atoms matching none of these are skipped.

    Parameters:
    -----------
    mol : RDKit Mol object
        The molecule for which the density grid is created. It must carry a
        conformer.
    properties : dict
        Maps atom index to a dict with the keys 'atomic_num',
        'h_bond_potential' and 'metal_coordination'.
    grid_size : tuple
        Fixed size of the grid (x, y, z).
    sigma : float
        Standard deviation for the Gaussian halo.

    Returns:
    --------
    py3Dmol.view
        A 3Dmol.js view object with one volumetric data set per channel.

    Raises:
    -------
    ValueError
        If the molecule has no atoms.
    """
    conf = mol.GetConformer()
    n_atoms = mol.GetNumAtoms()
    if n_atoms == 0:
        raise ValueError("Cannot build a density grid for a molecule with no atoms")

    # Pull the coordinates into a plain (n_atoms, 3) float array
    positions = np.empty((n_atoms, 3), dtype=float)
    for i in range(n_atoms):
        point = conf.GetAtomPosition(i)
        positions[i] = (point.x, point.y, point.z)

    # The grid spans the bounding box of the atom centres
    min_coords = positions.min(axis=0)
    max_coords = positions.max(axis=0)
    x = np.linspace(min_coords[0], max_coords[0], grid_size[0])
    y = np.linspace(min_coords[1], max_coords[1], grid_size[1])
    z = np.linspace(min_coords[2], max_coords[2], grid_size[2])
    grid = np.zeros((*grid_size, 3))  # 3 channels for chemical behavior

    # Populate the grid with Gaussian halos
    for atom in mol.GetAtoms():
        atom_idx = atom.GetIdx()
        prop = properties[atom_idx]
        atomic_num = prop['atomic_num']
        h_bond_potential = prop['h_bond_potential']
        metal_coordination = prop['metal_coordination']

        # Determine the channel based on chemical behavior
        if atomic_num in [6, 1]:  # Non-polar atoms (e.g., C, H)
            channel = 0
        elif h_bond_potential > 0:  # Polar atoms (e.g., N, O)
            channel = 1
        elif metal_coordination != 0:  # Metal coordination
            channel = 2
        else:
            continue

        # exp(-r^2 / (2 sigma^2)) factorises into one 1D Gaussian per axis, so
        # the halo is an outer product instead of a loop over every voxel
        px, py, pz = positions[atom_idx]
        gx = np.exp(-0.5 * ((x - px) / sigma) ** 2)
        gy = np.exp(-0.5 * ((y - py) / sigma) ** 2)
        gz = np.exp(-0.5 * ((z - pz) / sigma) ** 2)
        grid[:, :, :, channel] += gx[:, None, None] * gy[None, :, None] * gz[None, None, :]

    # Normalize the grid for visualization; an all-zero grid is left as is
    # rather than being divided by zero
    peak = grid.max()
    if peak > 0:
        grid /= peak

    # Create a 3Dmol.js view
    view = py3Dmol.view()
    for channel, color in enumerate(['blue', 'green', 'red']):  # Colors for channels
        iso_surface = grid[:, :, :, channel]
        channel_peak = iso_surface.max()
        if channel_peak > 0:  # Empty channels stay zero instead of becoming NaN
            iso_surface = iso_surface / channel_peak
        # NOTE(review): the data is passed as a nested list labelled "cube";
        # confirm that py3Dmol accepts this rather than cube-format text
        view.addVolumetricData(iso_surface.tolist(), "cube", {"opacity": 0.5, "color": color})

    return view

# Build the three-channel density view for the converted ligand and render it
view = create_3d_density_grid(mol=mol, properties=properties)
view.show()


Converting ligand HEM to RDKit molecule...


In [None]:
import os
from rdkit.Chem import AllChem, PDBParser as RDKitPDBParser
from rdkit.Chem import Draw
from IPython.display import display

def save_ligand_to_pdb(sorted_ligands, ligand_name, instance_idx=0, output_path=None):
    """
    Extract a specific ligand from the PDB structure and save it as a separate PDB file

    The ligand atoms are written as HETATM records and the file is then read
    back with RDKit. If the sanitized read fails, the file is read again
    without sanitization; when sanitization still fails the unsanitized
    molecule (with its coordinates) is returned after a warning.

    Parameters:
    -----------
    sorted_ligands : dict
        Dictionary mapping residue name to a list of ligand records; each
        record must provide 'residue' and 'chain_id'
    ligand_name : str
        The residue name of the ligand
    instance_idx : int
        Index of the ligand instance (default: 0 for the first instance).
        Negative indices count from the end, as for a Python list.
    output_path : str
        Path to save the extracted ligand (default: None, will use
        "<ligand_name>_<instance_idx>.pdb")

    Returns:
    --------
    output_path : str
        Path where the ligand was saved
    rdkit_mol : RDKit Mol
        Molecule read back from the written file, or None if RDKit could not
        load it

    Both values are None when the ligand name or instance index does not exist.
    """
    if ligand_name not in sorted_ligands:
        print(f"Ligand {ligand_name} not found")
        return None, None

    instances = sorted_ligands[ligand_name]
    n_instances = len(instances)
    # Reject out-of-range indices in both directions instead of letting a
    # large negative index escape as an IndexError
    if not -n_instances <= instance_idx < n_instances:
        print(f"Ligand {ligand_name} has only {n_instances} instances, requested index {instance_idx}")
        return None, None

    # Get the ligand residue
    ligand_instance = instances[instance_idx]
    ligand_residue = ligand_instance['residue']

    # Create output path if not provided
    if output_path is None:
        output_path = f"{ligand_name}_{instance_idx}.pdb"

    resname = ligand_residue.get_resname()
    chain_id = ligand_instance['chain_id']
    resseq = ligand_residue.id[1]

    # Write the ligand to a PDB file, one HETATM record per atom
    with open(output_path, 'w') as f:
        for atom in ligand_residue.get_atoms():
            # Normalise the element case ('FE' -> 'Fe')
            element = atom.element.strip().capitalize()
            coords = atom.get_coord()
            x, y, z = float(coords[0]), float(coords[1]), float(coords[2])
            # The full atom name keeps the original column alignment
            # (' CA ' vs 'FE  '); the element is right-justified in its field
            line = (
                f"HETATM{atom.get_serial_number():5d} {atom.get_fullname():4s} "
                f"{resname:3s} {chain_id:1s}{resseq:4d}    "
                f"{x:8.3f}{y:8.3f}{z:8.3f}  1.00  0.00          {element:>2s}  \n"
            )
            f.write(line)
        f.write("END\n")

    print(f"Ligand saved to {output_path}")

    # Load the ligand with RDKit
    try:
        # First try a fully sanitized read
        rdkit_mol = Chem.MolFromPDBFile(output_path, removeHs=False, sanitize=True)
        if rdkit_mol is None:
            # Fall back to an unsanitized read so that the atoms and
            # coordinates survive even when the chemistry is not valid
            rdkit_mol = Chem.MolFromPDBFile(output_path, removeHs=False, sanitize=False)
            if rdkit_mol is None:
                print("Error loading ligand with RDKit: the PDB file could not be parsed")
            else:
                try:
                    Chem.SanitizeMol(rdkit_mol)
                except Exception as exc:
                    print(f"Warning: Molecule sanitization failed ({exc}). Returning the unsanitized molecule.")
                    # Recompute cached atom properties without strict valence checks
                    rdkit_mol.UpdatePropertyCache(strict=False)
    except Exception as e:
        print(f"Error loading ligand with RDKit: {e}")
        rdkit_mol = None

    return output_path, rdkit_mol

# Take the first ligand type and write its first instance to a PDB file
first_ligand_name = list(sorted_ligands)[0]
output_path, rdkit_mol = save_ligand_to_pdb(sorted_ligands, first_ligand_name)

# Report the outcome; on success draw the ligand in 2D and in an interactive 3D view
if rdkit_mol is None:
    print("Failed to load ligand with RDKit")
else:
    print(f"Successfully loaded ligand with RDKit: {rdkit_mol.GetNumAtoms()} atoms, {rdkit_mol.GetNumBonds()} bonds")
    display(Draw.MolToImage(rdkit_mol))

    # 3D stick model built from the molecule's mol block
    view = py3Dmol.view(width=400, height=300)
    mb = Chem.MolToMolBlock(rdkit_mol)
    view.addModel(mb, "mol")
    view.setStyle({'stick': {}})
    view.zoomTo()
    view.show()

{0: {'atomic_num': 6,
  'is_aromatic': 0,
  'combined_charge': 0.0,
  'h_bond_potential': 0,
  'metal_coordination': 0},
 1: {'atomic_num': 6,
  'is_aromatic': 0,
  'combined_charge': 0.0,
  'h_bond_potential': 0,
  'metal_coordination': 0},
 2: {'atomic_num': 6,
  'is_aromatic': 0,
  'combined_charge': 0.0,
  'h_bond_potential': 0,
  'metal_coordination': 0},
 3: {'atomic_num': 6,
  'is_aromatic': 0,
  'combined_charge': 0.0,
  'h_bond_potential': 0,
  'metal_coordination': 0},
 4: {'atomic_num': 6,
  'is_aromatic': 0,
  'combined_charge': 0.0,
  'h_bond_potential': 0,
  'metal_coordination': 0},
 5: {'atomic_num': 6,
  'is_aromatic': 0,
  'combined_charge': 0.0,
  'h_bond_potential': 0,
  'metal_coordination': 0},
 6: {'atomic_num': 6,
  'is_aromatic': 0,
  'combined_charge': 0.0,
  'h_bond_potential': 0,
  'metal_coordination': 0},
 7: {'atomic_num': 6,
  'is_aromatic': 0,
  'combined_charge': 0.0,
  'h_bond_potential': 0,
  'metal_coordination': 0},
 8: {'atomic_num': 6,
  'is_arom