In [None]:
# Filter_SMILES

import json
import os

from rdkit import Chem


def process_json_line(line):
    """Extract the SMILES strings worth keeping from one JSON-lines record.

    The record's ``sub_smiles`` and ``smiles`` fields are read in that order.
    An entry is kept only if RDKit can parse it and the parsed molecule has
    at least 4 heavy atoms.

    Args:
        line: One line of text expected to hold a single JSON object.

    Returns:
        list[str]: The SMILES strings that passed the filter, in input order,
        with duplicates preserved. The list is empty for blank lines,
        undecodable JSON, and JSON values that are not objects.
    """
    if not line.strip():
        # Blank lines (e.g. a trailing newline at end of file) are not errors.
        return []

    try:
        data = json.loads(line)
    except json.JSONDecodeError:
        print(f"Error decoding JSON from line: {line}")
        return []

    if not isinstance(data, dict):
        # Valid JSON but not an object (list, number, ...): no fields to read.
        return []

    filtered_smiles = []
    for key in ('sub_smiles', 'smiles'):
        value = data.get(key)
        if isinstance(value, str):
            # A bare string would otherwise be iterated character by
            # character, silently discarding the real SMILES.
            value = [value]
        elif not isinstance(value, list):
            # Missing key, JSON null, or an unexpected type: nothing to add.
            continue

        for smi in value:
            if not isinstance(smi, str):
                # Only strings can be SMILES; skip nulls, numbers, nested data.
                continue
            mol = Chem.MolFromSmiles(smi)
            # MolFromSmiles returns None when the SMILES cannot be parsed.
            if mol is not None and mol.GetNumHeavyAtoms() >= 4:
                filtered_smiles.append(smi)
    return filtered_smiles


# Input file, read line by line; each line is handed to process_json_line.
input_file = "dataset_preparation/USPTO_full_merged_output.json"

# Outputs: every SMILES that passed the filter, and the de-duplicated list.
filtered_output_file = 'dataset_sample/filtered_smiles.txt'
unique_output_file = 'dataset_sample/unique_smiles.txt'


def _write_smiles(path, smiles):
    """Write one SMILES per line to ``path``, creating its directory if needed."""
    out_dir = os.path.dirname(path)
    if out_dir:
        # Fail-safe: do not lose a long filtering run to a missing directory.
        os.makedirs(out_dir, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(smi + '\n' for smi in smiles)


# Collect every SMILES that passes the filter, duplicates included.
filtered_smiles = []
with open(input_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        filtered_smiles.extend(process_json_line(line))

# De-duplicate by exact string while keeping first-seen order.
# list(set(...)) would give an ordering that changes between interpreter
# runs (string hashes are randomised), making the output file unstable.
unique_smiles = list(dict.fromkeys(filtered_smiles))

_write_smiles(filtered_output_file, filtered_smiles)
_write_smiles(unique_output_file, unique_smiles)

print(f"处理完成，结果已保存到{filtered_output_file}和{unique_output_file}文件中。")

In [None]:
# Random sampling: draw a fixed-size subset of the unique SMILES file.
import random

# Number of SMILES to draw. It is also embedded in the output file name and
# in the messages below, so changing it here keeps everything consistent.
# NOTE(review): 8196 is kept from the original; confirm it is not a typo
# for 8192 (2**13).
SAMPLE_SIZE = 8196
# Fixed seed so that re-running this cell on the same input file draws the
# same sample.
RANDOM_SEED = 42

# Input: one SMILES per line.
filtered_file = 'dataset_sample/unique_smiles.txt'
# Output: the sampled SMILES, one per line.
random_output_file = f'dataset_sample/USPTO_full_random_{SAMPLE_SIZE}_smiles.txt'

# Load all candidate SMILES.
with open(filtered_file, 'r', encoding='utf-8') as infile:
    smiles_list = infile.read().splitlines()

# Sampling without replacement needs at least SAMPLE_SIZE candidates.
if len(smiles_list) < SAMPLE_SIZE:
    raise ValueError(f"文件 {filtered_file} 中的SMILES式数量不足{SAMPLE_SIZE}个，只有 {len(smiles_list)} 个。")

# A dedicated Random instance keeps the draw reproducible without touching
# the global random state that other cells may rely on.
random_smiles = random.Random(RANDOM_SEED).sample(smiles_list, SAMPLE_SIZE)

# Save the sample.
with open(random_output_file, 'w', encoding='utf-8') as outfile:
    outfile.writelines(smi + '\n' for smi in random_smiles)

print(f"随机选取的{SAMPLE_SIZE}个SMILES式已保存到{random_output_file}文件中。")