In [None]:
## Need to create code to find 75/4 molecules, to see how training impression on 75 compares to 15 training molecules
## Search for the least similar molecules in the output folder


In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import os
import heapq

# Function to compute Tanimoto similarity between two fingerprints
def tanimoto_similarity(fp1, fp2):
    return DataStructs.FingerprintSimilarity(fp1, fp2)

# Load molecules from SDF or MOL files in the directory
def load_molecules(folder):
    molecules = []
    for filename in os.listdir(folder):
        if filename.endswith(".sdf") or filename.endswith(".mol"):
            mol = Chem.MolFromMolFile(os.path.join(folder, filename))
            if mol:
                molecules.append((filename, mol))
    return molecules

# Compute fingerprints for all molecules
def compute_fingerprints(molecules):
    fingerprints = {}
    for name, mol in molecules:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        fingerprints[name] = fp
    return fingerprints

# Calculate average similarity for each molecule
def calculate_dissimilarity(fingerprints):
    similarity_scores = {}
    names = list(fingerprints.keys())
    
    for i, name1 in enumerate(names):
        total_similarity = 0
        count = 0
        for j, name2 in enumerate(names):
            if i != j:
                total_similarity += tanimoto_similarity(fingerprints[name1], fingerprints[name2])
                count += 1
        similarity_scores[name1] = total_similarity / count if count > 0 else 0
    
    return similarity_scores

# Main function
def main():
    folder = r"C:\\Project_code\\OUTPUT\\90%"
    molecules = load_molecules(folder)
    fingerprints = compute_fingerprints(molecules)
    similarity_scores = calculate_dissimilarity(fingerprints)
    
    # Find 15 molecules with lowest average similarity (most dissimilar)
    most_dissimilar = heapq.nsmallest(15, similarity_scores, key=similarity_scores.get)
    
    print("15 Most Dissimilar Molecules:")
    for name in most_dissimilar:
        print(name, "- Avg Similarity:", similarity_scores[name])

if __name__ == "__main__":
    main()


[11:48:12] Explicit valence for atom # 13 N, 4, is greater than permitted
[11:48:13] Explicit valence for atom # 6 N, 4, is greater than permitted


15 Most Dissimilar Molecules:
molecule_56.nmredata.sdf - Avg Similarity: 0.04187123966454258
molecule_30.nmredata.sdf - Avg Similarity: 0.05026263484829218
molecule_103.nmredata.sdf - Avg Similarity: 0.0611740334431551
molecule_66.nmredata.sdf - Avg Similarity: 0.06184803298531282
molecule_68.nmredata.sdf - Avg Similarity: 0.06229742429814011
molecule_79.nmredata.sdf - Avg Similarity: 0.06472954636397914
molecule_57.nmredata.sdf - Avg Similarity: 0.06895713058251941
molecule_118.nmredata.sdf - Avg Similarity: 0.07066210668683696
molecule_33.nmredata.sdf - Avg Similarity: 0.07506433125840502
molecule_94.nmredata.sdf - Avg Similarity: 0.07607815133637981
molecule_80.nmredata.sdf - Avg Similarity: 0.07714809948194963
molecule_13.nmredata.sdf - Avg Similarity: 0.07743748896680828
molecule_108.nmredata.sdf - Avg Similarity: 0.07988702463887425
molecule_49.nmredata.sdf - Avg Similarity: 0.08337940615364914
molecule_77.nmredata.sdf - Avg Similarity: 0.08412128657022896




In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import os
import heapq

# Function to compute Tanimoto similarity between two fingerprints
def tanimoto_similarity(fp1, fp2):
    return DataStructs.FingerprintSimilarity(fp1, fp2)

# Load molecules from SDF or MOL files in the directory
def load_molecules(folder):
    molecules = []
    for filename in os.listdir(folder):
        if filename.endswith(".sdf") or filename.endswith(".mol"):
            mol = Chem.MolFromMolFile(os.path.join(folder, filename))
            if mol:
                molecules.append((filename, mol))
    return molecules

# Compute fingerprints for all molecules
def compute_fingerprints(molecules):
    fingerprints = {}
    for name, mol in molecules:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        fingerprints[name] = fp
    return fingerprints

# Calculate average similarity for each molecule
def calculate_dissimilarity(fingerprints):
    similarity_scores = {}
    names = list(fingerprints.keys())
    
    for i, name1 in enumerate(names):
        total_similarity = 0
        count = 0
        for j, name2 in enumerate(names):
            if i != j:
                total_similarity += tanimoto_similarity(fingerprints[name1], fingerprints[name2])
                count += 1
        similarity_scores[name1] = total_similarity / count if count > 0 else 0
    
    return similarity_scores

# Main function
def main():
    folder = r"C:\\Project_code\\OUTPUT\\90%"
    molecules = load_molecules(folder)
    fingerprints = compute_fingerprints(molecules)
    similarity_scores = calculate_dissimilarity(fingerprints)
    
    # Find 15 molecules with lowest average similarity (most dissimilar)
    most_dissimilar = heapq.nsmallest(15, similarity_scores, key=similarity_scores.get)
    
    print("15 Most Dissimilar Molecules:")
    for name in most_dissimilar:
        print(name, "- Avg Similarity:", similarity_scores[name])

import shutil  # For copying files

# Create directory if it doesn't exist
def create_output_folder(folder):
    if not os.path.exists(folder):
        os.makedirs(folder)

# Copy least similar molecules to new folder
def copy_dissimilar_molecules(most_dissimilar, source_folder, target_folder):
    create_output_folder(target_folder)
    for name in most_dissimilar:
        src = os.path.join(source_folder, name)
        dst = os.path.join(target_folder, name)
        shutil.copy(src, dst)

if __name__ == "__main__":
    folder = r"C:\Project_code\OUTPUT\90%"
    target_folder = r"C:\Project_code\15_molecules"
    molecules = load_molecules(folder)
    fingerprints = compute_fingerprints(molecules)
    similarity_scores = calculate_dissimilarity(fingerprints)
    
    # Find 15 molecules with lowest average similarity (most dissimilar)
    most_dissimilar = heapq.nsmallest(15, similarity_scores, key=similarity_scores.get)
    
    print("15 Most Dissimilar Molecules:")
    for name in most_dissimilar:
        print(name, "- Avg Similarity:", similarity_scores[name])
    
    # Copy least similar molecules to the new folder
    copy_dissimilar_molecules(most_dissimilar, folder, target_folder)()


[18:09:25] Explicit valence for atom # 13 N, 4, is greater than permitted
[18:09:25] Explicit valence for atom # 6 N, 4, is greater than permitted


15 Most Dissimilar Molecules:
molecule_56.nmredata.sdf - Avg Similarity: 0.04187123966454258
molecule_30.nmredata.sdf - Avg Similarity: 0.05026263484829218
molecule_103.nmredata.sdf - Avg Similarity: 0.0611740334431551
molecule_66.nmredata.sdf - Avg Similarity: 0.06184803298531282
molecule_68.nmredata.sdf - Avg Similarity: 0.06229742429814011
molecule_79.nmredata.sdf - Avg Similarity: 0.06472954636397914
molecule_57.nmredata.sdf - Avg Similarity: 0.06895713058251941
molecule_118.nmredata.sdf - Avg Similarity: 0.07066210668683696
molecule_33.nmredata.sdf - Avg Similarity: 0.07506433125840502
molecule_94.nmredata.sdf - Avg Similarity: 0.07607815133637981
molecule_80.nmredata.sdf - Avg Similarity: 0.07714809948194963
molecule_13.nmredata.sdf - Avg Similarity: 0.07743748896680828
molecule_108.nmredata.sdf - Avg Similarity: 0.07988702463887425
molecule_49.nmredata.sdf - Avg Similarity: 0.08337940615364914
molecule_77.nmredata.sdf - Avg Similarity: 0.08412128657022896


TypeError: 'NoneType' object is not callable