Import

In [14]:
import os
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator, Draw
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.ML.Cluster import Butina
from rdkit.DataStructs import TanimotoSimilarity
from collections import Counter

Set the working directory

In [9]:
home_dir = os.path.expanduser('~')
working_dir = os.path.join(home_dir, 'Drug_design', 'find_common')
os.makedirs(working_dir, exist_ok=True)
os.chdir(working_dir)
print(f"Working directory set to: {os.getcwd()}")

Working directory set to: /home/liuxuan/Drug_design/find_common


Define SMILES srting from the 2 documents

In [None]:
# Define SMILES strings from the two documents
smiles_list1 = [
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1cccc(OC)c1)C([O-])=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccccc1F)C([O-])=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1cccc(F)c1)C([O-])=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccccn1)C([O-])=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccc(cc1)C#N)C([O-])=O",
    "CC(C)C(=O)N[C@@H]1[C@H](CC(=C[C@H]1N1CCC[C@@H](C1)OCCO)C([O-])=O)n1cc(nn1)-c1ccc(cc1)C#N",
    "CC(C)C(=O)N[C@H]1[C@@H](C=C(C[C@@H]1n1cc(nn1)-c1ccc(cc1)C#N)C([O-])=O)N1CCC[C@H](O)C1",
    "CC(C)C(=O)N[C@H]1[C@@H](C=C(C[C@@H]1n1cc(nn1)-c1ccc(F)cc1)C([O-])=O)N1CCC[C@H](O)C1",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccc(cc1)C(C)(C)C)C([O-])=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccc2OCOc2c1)C([O-])=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1cncnc1)C([O-])=O"
]

smiles_list2 = [
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccc(OC)cc1)C(O)=O",
    "CC(C)C(=O)N[C@@H]1[C@H](CC(=C[C@H]1N1CCC[C@@H](C1)OCCO)C(O)=O)n1cc(nn1)-c1ccc(F)cc1",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccc(cc1)C(F)(F)F)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@H](NS(=O)(=O)C2CC2)[C@H]1NC(=O)C(C)C)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@H](NS(=O)(=O)c2ccc(C)cc2)[C@H]1NC(=O)C(C)C)C(O)=O",
    "CC(C)C(=O)N[C@@H]1[C@H](CC(=C[C@H]1N1CCC[C@H](O)C1)C(O)=O)n1cc(nn1)-c1ccccc1",
    "CC(C)C(=O)N[C@@H]1[C@H](CC(=C[C@H]1N1CCC[C@@H](C1)OCCO)C(O)=O)n1cc(nn1)-c1ccccc1",
    "COc1ccc(cc1)-c1cn(nn1)[C@H]1CC(=C[C@H]([C@@H]1NC(=O)C(C)C)N1CCC[C@@H](C1)OCCO)C(O)=O",
    "COc1ccc(cc1)-c1cn(nn1)[C@H]1CC(=C[C@H]([C@@H]1NC(=O)C(C)C)N1CCC[C@H](O)C1)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccccc1OC)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccc(cc1)-c1ccccc1)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1cccnc1)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccncc1)C(O)=O",
    "CCCCOCc1cn(nn1)[C@H]1CC(=C[C@H]([C@@H]1NC(=O)C(C)C)N1CCC[C@@H](C1)OCCOC)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)C1CCCCC1)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccc(cc1)C(C)(C)O)C(O)=O",
    "COCCO[C@H]1CCCN(C1)[C@@H]1C=C(C[C@@H]([C@H]1NC(=O)C(C)C)n1cc(nn1)-c1ccc(Cl)s1)C(O)=O",
    "CC(C)C(=O)N[C@H]1[C@@H](OC(=C[C@@H]1n1cc(nn1)-c1ccccc1)C(O)=O)[C@H](O)[C@H](O)CO"
]

# Combine the SMILES lists
all_smiles = smiles_list1 + smiles_list2

# Remove duplicates using canonical SMILES
unique_mols = {}
for smiles in all_smiles:
    mol = Chem.MolFromSmiles(smiles)
    if mol:  # Check if the SMILES is valid
        canon_smiles = Chem.MolToSmiles(mol, canonical=True)
        if canon_smiles not in unique_mols:
            unique_mols[canon_smiles] = mol

unique_mols_list = list(unique_mols.values())
print(f"Number of unique molecules: {len(unique_mols_list)}")


Number of unique molecules: 29


Clustering

In [10]:
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fps = [fpgen.GetFingerprint(mol) for mol in unique_mols_list]

# Define Tanimoto distance function
def tanimoto_dist(fp1, fp2):
    return 1 - TanimotoSimilarity(fp1, fp2)

# Perform Butina clustering (cutoff=0.3 means similarity >= 0.7)
cutoff = 0.3
clusters = Butina.ClusterData(
    data=fps,
    nPts=len(fps),
    distThresh=cutoff,
    isDistData=False,
    distFunc=tanimoto_dist
)
print(f"Number of clusters: {len(clusters)}")

Number of clusters: 5


Find common

In [15]:
# Find common structure for each cluster
common_structures = []
for cluster_idx, cluster in enumerate(clusters):
    cluster_mols = [unique_mols_list[i] for i in cluster]
    print(f"Cluster {cluster_idx + 1} contains {len(cluster_mols)} molecules")

    if len(cluster_mols) == 1:
        # Singleton cluster: use the molecule itself as the common structure
        common_structure = cluster_mols[0]
    else:
        # Multiple molecules: find the most common Murcko scaffold
        scaffolds = [MurckoScaffold.GetScaffoldForMol(mol) for mol in cluster_mols]
        scaffold_smiles = [Chem.MolToSmiles(scaf) for scaf in scaffolds if scaf]
        
        if scaffold_smiles:
            # Count scaffold occurrences and pick the most common one
            scaffold_counts = Counter(scaffold_smiles)
            most_common_scaffold_smiles = max(scaffold_counts.items(), key=lambda x: x[1])[0]
            common_structure = Chem.MolFromSmiles(most_common_scaffold_smiles)
        else:
            # Fallback: use the first molecule in the cluster if no scaffolds are found
            common_structure = cluster_mols[0]

    common_structures.append(common_structure)

# Output and visualize the common structures
print("\nCommon structures for each cluster:")
for i, common_mol in enumerate(common_structures):
    common_smiles = Chem.MolToSmiles(common_mol)
    print(f"Cluster {i + 1} common structure SMILES: {common_smiles}")
    
    # Visualize the common structure and save as PNG
    output_file = f"cluster_{i + 1}_common_structure.png"
    try:
        Draw.MolToFile(common_mol, output_file, size=(300, 300))
        print(f"Saved visualization for Cluster {i + 1} to {output_file}")
    except Exception as e:
        print(f"Failed to save visualization for Cluster {i + 1}: {e}")

# Verify the number of common structures matches the number of clusters
assert len(common_structures) == len(clusters), "Number of common structures does not match number of clusters"
print(f"Number of common structure SMILES output: {len(common_structures)}")

Cluster 1 contains 22 molecules
Cluster 2 contains 2 molecules
Cluster 3 contains 2 molecules
Cluster 4 contains 2 molecules
Cluster 5 contains 1 molecules

Common structures for each cluster:
Cluster 1 common structure SMILES: C1=C[C@@H](N2CCCCC2)C[C@@H](n2cc(-c3ccccc3)nn2)C1
Saved visualization for Cluster 1 to cluster_1_common_structure.png
Cluster 2 common structure SMILES: C1=C[C@@H](N2CCCCC2)C[C@@H](n2cc(-c3ccccc3)nn2)C1
Saved visualization for Cluster 2 to cluster_2_common_structure.png
Cluster 3 common structure SMILES: C1=C[C@@H](N2CCCCC2)C[C@@H](n2cc(-c3ccccc3)nn2)C1
Saved visualization for Cluster 3 to cluster_3_common_structure.png
Cluster 4 common structure SMILES: O=S(=O)(N[C@H]1CC=C[C@@H](N2CCCCC2)C1)c1ccccc1
Saved visualization for Cluster 4 to cluster_4_common_structure.png
Cluster 5 common structure SMILES: CC(C)C(=O)N[C@H]1[C@H]([C@H](O)[C@H](O)CO)OC(C(=O)O)=C[C@@H]1n1cc(-c2ccccc2)nn1
Saved visualization for Cluster 5 to cluster_5_common_structure.png
Number of commo