spiro_cycle_analysis

single_molecule_spiro_cycle_judge

In [None]:
from rdkit import Chem

def is_spiro_compound(smiles):
    # 从SMILES创建分子对象
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, None
    
    # 获取分子的环系信息
    ri = mol.GetRingInfo()
    
    # 获取所有原子
    atoms = mol.GetAtoms()
    
    for atom in atoms:
        # 获取该原子所在的所有环的索引
        ring_indices = ri.NumAtomRings(atom.GetIdx())
        if ring_indices == 2:
            # 获取包含该原子的两个环的原子索引
            rings = [r for r in ri.AtomRings() if atom.GetIdx() in r]
            if len(rings) == 2:
                # 检查这两个环是否没有其他共享原子
                shared_atoms = set(rings[0]) & set(rings[1])
                if len(shared_atoms) == 1 and list(shared_atoms)[0] == atom.GetIdx():
                    return True, (len(rings[0]), len(rings[1]))
    
    return False, None

# 示例使用
smiles = "C1([Si](C2=CC=CC=C2)(OC3CCCCC3)C4=CC=CC=C4)=CC=CC=C1"  # 螺环化合物的SMILES
result, rings_sizes = is_spiro_compound(smiles)
if result:
    ring1_size, ring2_size = rings_sizes
    print(f"The molecule is a spiro compound with rings of sizes {ring1_size} and {ring2_size}.")
else:
    print("The molecule is not a spiro compound.")

statistics_analysis of spiro_cycle

In [122]:
from rdkit import Chem
from collections import defaultdict

def is_spiro_compound(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, None
    
    ri = mol.GetRingInfo()
    for atom in mol.GetAtoms():
        ring_indices = ri.NumAtomRings(atom.GetIdx())
        if ring_indices == 2:
            rings = [r for r in ri.AtomRings() if atom.GetIdx() in r]
            if len(rings) == 2:
                shared_atoms = set(rings[0]) & set(rings[1])
                if len(shared_atoms) == 1 and list(shared_atoms)[0] == atom.GetIdx():
                    return True, (len(rings[0]), len(rings[1]))
    return False, None

def analyze_file(filename):
    results = defaultdict(list)
    total_spiro_count = 0

    with open(filename, 'r') as file:
        for line in file:
            smiles = line.strip()
            result, rings_sizes = is_spiro_compound(smiles)
            if result:
                total_spiro_count += 1
                key = f"Ring sizes {rings_sizes[0]} and {rings_sizes[1]}"
                results[key].append(smiles)
            else:
                results["Non-spiro compounds"].append(smiles)

    print("Summary of Spirocyclic Compounds:")
    for key, smiles_list in results.items():
        print(f"{key}: {len(smiles_list)} compounds")
        for smiles in smiles_list:
            print(f"  {smiles}")
        print()

    print(f"Total number of spiro compounds: {total_spiro_count}")

# Replace 'your_file.txt' with the path to your text file containing SMILES strings.
analyze_file('/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/total_processed_1008_unique_standard_merged.txt')

Summary of Spirocyclic Compounds:
Non-spiro compounds: 7772 compounds
  Cc1c(C(=O)O)cccc1[N+](=O)[O-]
  Cc1c2ccc(N)cc2nn1C
  C=CC(=O)OC(C=C)[C@@H](CC)[C@H]1CCCCN2C(=O)CC[C@@H]12
  C=CC(=O)Nc1cc(Nc2ncc(C(=O)OC(C)C)c(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C
  BrCCc1c[nH]c2ccccc12
  COc1cccc(-c2cccc3c2C(=O)c2cc(C(C)C)cc(OC)c2C3=O)c1CSC
  CC(C)(C)[Si](C)(C)O[C@H]1CCC2=CC(=O)CC[C@@]21C
  CC[C@@H](Oc1cccc(CN(CCCOc2ccc(OC)cc2)c2nc3ccccc3o2)c1)C(=O)O
  Cc1ccccc1-c1cc(N2CCN(C)CC2)ncc1C(N)=O
  CC[C@H](C)[C@@H]([C@@H](CC(=O)N1CCC[C@H]1[C@H](OC)C(C)C(=O)N[C@@H](Cc1ccccc1)C(=O)O)OC)N(C)C(=O)[C@@H](CC(=O)[C@H](C(C)C)N(C)C(=O)CCCCCN1C(=O)C=CC1=O)C(C)C
  C=CC(O)c1ccc(OC)c(O)c1
  N#Cc1ccccc1CN
  Nc1ccn([C@@H]2O[C@H](CI)[C@@H](O)[C@@H]2F)c(=O)n1
  Cc1cc(F)ccc1Br
  COc1cc2c(Br)c(CBr)c3ccc(OC)c(OC)c3c2cc1OC
  O=C1c2ccccc2C(=O)N1CCO
  N#Cc1ccc(Nc2nc(Cl)c(Br)c(Cl)n2)cc1
  CC(C)n1c(=O)cc(Cl)[nH]c1=O
  CC[Si](CC)(CC)O[C@@]1(C)CC[C@H]2C(=O)CC[C@H]21
  CC(C)[C@@H]1CC[C@]2(C)C=C(I)C(=O)C[C@@H]12
  CC1(C)Oc2ccc3c

In [126]:
from rdkit import Chem
from collections import defaultdict

def is_spiro_compound(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False
    
    ri = mol.GetRingInfo()
    for atom in mol.GetAtoms():
        ring_indices = ri.NumAtomRings(atom.GetIdx())
        if ring_indices == 2:
            rings = [r for r in ri.AtomRings() if atom.GetIdx() in r]
            if len(rings) == 2:
                shared_atoms = set(rings[0]) & set(rings[1])
                if len(shared_atoms) == 1 and list(shared_atoms)[0] == atom.GetIdx():
                    return True
    return False

def analyze_and_save_spiro_compounds(input_filename, output_spiro_filename):
    spiro_counts = defaultdict(int)
    total_count = 0

    with open(input_filename, 'r') as file, open(output_spiro_filename, 'w') as outfile:
        for line in file:
            smiles = line.strip()
            total_count += 1
            if is_spiro_compound(smiles):
                spiro_counts[smiles] += 1
                outfile.write(f"{smiles}\n")

    # 计算占比
    total_spiro_count = sum(spiro_counts.values())
    for smiles, count in spiro_counts.items():
       占比 = (count / total_count) * 100
    print(f"SMILES: {smiles}, Count: {count}, Percentage: {占比:.2f}%")

    print(f"Total number of compounds: {total_count}")
    print(f"Total number of spiro compounds: {total_spiro_count}")
    print(f"Percentage of spiro compounds: {(total_spiro_count / total_count) * 100:.2f}%")

# Replace 'input_file.txt' with the path to your input text file containing SMILES strings.
# Replace 'spiro_compounds.txt' with the desired output file path.
input_filename = '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/total_processed_1008_unique_standard_merged.txt'
output_spiro_filename = '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/spiro_compounds.txt'
analyze_and_save_spiro_compounds(input_filename, output_spiro_filename)


SMILES: COC[C@@]12CCCC(O)[C@H]1CC1(CC2)OCCO1, Count: 1, Percentage: 0.01%
Total number of compounds: 8185
Total number of spiro compounds: 413
Percentage of spiro compounds: 5.05%


calculate_the_duplicates

In [127]:
def merge_smiles_files(file1, file2, merged_file, duplicates_file):
    smiles_set = set()
    duplicates = set()

    # 读取第一个文件并添加到集合中
    with open(file1, 'r') as f1:
        for line in f1:
            smiles_set.add(line.strip())

    # 读取第二个文件
    with open(file2, 'r') as f2:
        for line in f2:
            smiles = line.strip()
            if smiles in smiles_set:
                duplicates.add(smiles)
            else:
                smiles_set.add(smiles)

    # 保存合并后的SMILES到总文件
    with open(merged_file, 'w') as f_merged:
        for smiles in smiles_set:
            f_merged.write(f"{smiles}\n")

    # 保存重复的SMILES到重复文件
    with open(duplicates_file, 'w') as f_duplicates:
        for smiles in duplicates:
            f_duplicates.write(f"{smiles}\n")

    return len(smiles_set), len(duplicates)

# 调用函数
file1 = '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/spiro_compounds.txt'
file2 = '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/bridged_smiles.txt'
merged_file = '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/S_and_B_merged_smiles.txt'
duplicates_file = '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/duplicates_smiles.txt'
total_count, duplicate_count = merge_smiles_files(file1, file2, merged_file, duplicates_file)

print(f"Total number of unique SMILES: {total_count}")
print(f"Number of duplicate SMILES: {duplicate_count}")

Total number of unique SMILES: 834
Number of duplicate SMILES: 26


bridge_cycles_judge

In [None]:
from rdkit import Chem

def find_bridged_rings(smiles):
    # 从SMILES创建分子对象
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, None
    
    # 获取分子的环系信息
    ri = mol.GetRingInfo()
    
    # 获取所有环
    all_rings = ri.AtomRings()
    
    for ring1 in all_rings:
        for ring2 in all_rings:
            if ring1 is not ring2:
                # 检查两个环是否共享三个或更多的原子
                shared_atoms = set(ring1) & set(ring2)
                if len(shared_atoms) >= 3:
                    # 计算两个环的大小
                    ring1_size = len(ring1)
                    ring2_size = len(ring2)
                    return True, (ring1_size, ring2_size)
    
    return False, None

# 示例使用
smiles = "[C@@H]1(C2)CCCC[C@@H]2CCCC1"  # 桥环化合物的SMILES
is_bridged, ring_sizes = find_bridged_rings(smiles)
if is_bridged:
    print(f"The molecule with SMILES {smiles} is a bridged compound with rings of sizes {ring_sizes[0]} and {ring_sizes[1]}.")
else:
    print(f"The molecule with SMILES {smiles} is not a bridged compound.")

bridged_cycle_analysis

In [1]:
from rdkit import Chem
from collections import defaultdict

def find_bridged_rings(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, None
    
    ri = mol.GetRingInfo()
    all_rings = ri.AtomRings()
    
    for ring1 in all_rings:
        for ring2 in all_rings:
            if ring1 is not ring2:
                shared_atoms = set(ring1) & set(ring2)
                if len(shared_atoms) >= 3:
                    ring1_size = len(ring1)
                    ring2_size = len(ring2)
                    return True, (ring1_size, ring2_size)
    
    return False, None

def analyze_file(filename):
    bridged_stats = defaultdict(lambda: {'count': 0, 'sizes': []})
    total_smiles = 0
    total_bridged = 0
    bridged_smiles_file = '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/bridged_smiles.txt'
    
    with open(filename, 'r') as file:
        for line in file:
            smiles = line.strip()
            total_smiles += 1
            result, ring_sizes = find_bridged_rings(smiles)
            if result:
                total_bridged += 1
                key = f"{min(ring_sizes)} and {max(ring_sizes)}"
                bridged_stats[key]['count'] += 1
                bridged_stats[key]['sizes'].append(smiles)
                with open(bridged_smiles_file, 'a') as smiles_file:
                    smiles_file.write(f"{smiles} ({key})\n")
    
    # Print individual bridged compound stats
    print("Analysis of Bridged Compounds:")
    for key, stats in bridged_stats.items():
        percentage = (stats['count'] / total_bridged) * 100 if total_bridged else 0
        print(f"Ring sizes {key}: {stats['count']} compounds ({percentage:.2f}%)")
    
    print("\nBridged compounds saved to: bridged_smiles.txt")
    
    # Print total stats
    print(f"\nTotal number of smiles: {total_smiles}")
    print(f"Total number of bridged compounds: {total_bridged}")
    print(f"Percentage of bridged compounds: {(total_bridged / total_smiles) * 100:.2f}%")
    
    # Print categorized stats
    print("\nCategorized Bridged Compound Stats:")
    for key, stats in bridged_stats.items():
        print(f"Ring sizes {key}: {stats['count']} compounds")

        # Calculate and print percentage of each category
        percentage = (stats['count'] / total_bridged) * 100
        print(f"Percentage of ring sizes {key}: {percentage:.2f}%")

# Replace 'your_file.txt' with the path to your text file containing SMILES strings.
analyze_file('/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/1_data_analysis/00_total/total_processed_1008_unique_standard_merged.txt')

Analysis of Bridged Compounds:
Ring sizes 5 and 5: 50 compounds (11.19%)
Ring sizes 6 and 6: 142 compounds (31.77%)
Ring sizes 6 and 9: 18 compounds (4.03%)
Ring sizes 5 and 6: 136 compounds (30.43%)
Ring sizes 6 and 12: 2 compounds (0.45%)
Ring sizes 5 and 7: 7 compounds (1.57%)
Ring sizes 5 and 20: 2 compounds (0.45%)
Ring sizes 8 and 9: 9 compounds (2.01%)
Ring sizes 5 and 17: 3 compounds (0.67%)
Ring sizes 5 and 9: 4 compounds (0.89%)
Ring sizes 5 and 18: 6 compounds (1.34%)
Ring sizes 4 and 6: 21 compounds (4.70%)
Ring sizes 6 and 22: 3 compounds (0.67%)
Ring sizes 6 and 7: 9 compounds (2.01%)
Ring sizes 5 and 12: 6 compounds (1.34%)
Ring sizes 5 and 23: 4 compounds (0.89%)
Ring sizes 6 and 16: 1 compounds (0.22%)
Ring sizes 6 and 8: 5 compounds (1.12%)
Ring sizes 6 and 18: 4 compounds (0.89%)
Ring sizes 7 and 11: 1 compounds (0.22%)
Ring sizes 6 and 10: 9 compounds (2.01%)
Ring sizes 15 and 16: 1 compounds (0.22%)
Ring sizes 6 and 13: 3 compounds (0.67%)
Ring sizes 6 and 11: 1 co