# In [None]:
import json
from rdkit import Chem
from rdkit.Chem import SpacialScore
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d

# Read a JSON Lines file (one JSON document per line)
def read_json_file(file_path):
    """Load every JSON document from a line-delimited JSON file.

    Args:
        file_path: Path of the file to read. Each non-blank line must hold
            one complete JSON document.

    Returns:
        A list with the decoded value of each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (typically a trailing newline at end of file) carry
            # no record; json.loads would raise on them.
            if not line:
                continue
            data.append(json.loads(line))
    return data

# Compute the SPS (spacial score) indices of one molecule
def calculate_sps_indices(smiles):
    """Compute the normalised and unnormalised spacial score of a SMILES.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A tuple ``(sps_normalized, sps_unnormalized)`` as returned by
        ``SpacialScore.SPS`` with ``normalize=True`` and ``normalize=False``.
        ``(None, None)`` is returned when ``smiles`` is not a string, cannot
        be parsed, or describes a molecule without heavy atoms.
    """
    # A non-string entry (e.g. a JSON null) would make MolFromSmiles raise
    # and abort the whole run; treat it like any other unusable entry.
    if not isinstance(smiles, str):
        return None, None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, None

    # An empty SMILES (or a hydrogen-only molecule) parses successfully but
    # has no heavy atoms; per the RDKit documentation the normalised score
    # is divided by the heavy-atom count, so it is undefined here.
    if mol.GetNumHeavyAtoms() == 0:
        return None, None

    sps_index = SpacialScore.SPS(mol, normalize=True)
    sps_index_unnormalized = SpacialScore.SPS(mol, normalize=False)
    return sps_index, sps_index_unnormalized

# Extract the SMILES stored under a key and compute their SPS indices
def calculate_sps_indices_of_smiles(data, key):
    """Collect SPS indices for every SMILES stored under ``key``.

    Args:
        data: Iterable of dict-like records (anything offering ``.get``).
        key: Name of the field holding the SMILES. The field may be a list
            of SMILES strings or a single SMILES string.

    Returns:
        A tuple ``(sps_indices, sps_indices_unnormalized)`` of two parallel
        lists. SMILES for which ``calculate_sps_indices`` yields ``None``
        are left out of both lists.
    """
    sps_indices = []
    sps_indices_unnormalized = []
    for entry in data:
        smiles_field = entry.get(key)
        # Missing key or explicit null: nothing to score for this record.
        if smiles_field is None:
            continue
        # A bare string must be treated as ONE molecule; iterating it
        # directly would score every character as its own SMILES.
        if isinstance(smiles_field, str):
            smiles_field = [smiles_field]
        for smiles in smiles_field:
            sps_index, sps_index_unnormalized = calculate_sps_indices(smiles)
            if sps_index is not None:
                sps_indices.append(sps_index)
                sps_indices_unnormalized.append(sps_index_unnormalized)
    return sps_indices, sps_indices_unnormalized

# Draw smoothed frequency-distribution curves
def _smoothed_density(values, bins=20, num_points=1000):
    """Return ``(x, y)`` for a cubic-smoothed density curve of ``values``."""
    density, bin_edges = np.histogram(values, bins=bins, density=True)
    bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])

    interpolate = interp1d(bin_centers, density, kind='cubic', fill_value="extrapolate")
    # Bin centres are increasing, so the first and last entries are the
    # minimum and maximum of the sampled range.
    x = np.linspace(bin_centers[0], bin_centers[-1], num_points)
    # Cubic interpolation can dip below zero between samples; a density
    # cannot be negative, so clamp the curve at zero.
    y = np.clip(interpolate(x), 0, None)
    return x, y


def plot_sps_distribution(sps_indices1, sps_indices2, label1, label2, title,
                          xlabel='nSPS Value'):
    """Plot the smoothed density curves of two score samples on one figure.

    Each sample is binned into a 20-bin density histogram, interpolated with
    a cubic curve through the bin centres, and drawn as a line. The figure
    is displayed with ``plt.show()``.

    Args:
        sps_indices1: Scores of the first data set (drawn in blue).
        sps_indices2: Scores of the second data set (drawn in green).
        label1: Legend label of the first curve.
        label2: Legend label of the second curve.
        title: Figure title.
        xlabel: Label of the x axis. Defaults to ``'nSPS Value'``; pass a
            different label when the plotted scores are not normalised.
    """
    plt.figure(figsize=(10, 6))

    series = (
        (sps_indices1, label1, 'blue'),
        (sps_indices2, label2, 'green'),
    )
    for values, label, color in series:
        x, y = _smoothed_density(values)
        plt.plot(x, y, label=label, color=color)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Probability Density')
    plt.legend()

    # Hide the top and right frame lines
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    plt.show()

# Main entry point
def main(file_path1="data_1.json", file_path2='data_2.json'):
    """Compare the SPS distributions of two line-delimited JSON data sets.

    Reads both files, scores the SMILES stored under ``"products"`` in the
    first file and under ``"smiles"`` in the second, then shows one plot for
    the normalised scores and one for the unnormalised scores.

    Args:
        file_path1: Path of the first JSON file (SMILES under ``"products"``).
        file_path2: Path of the second JSON file (SMILES under ``"smiles"``).
    """
    data1 = read_json_file(file_path1)
    data2 = read_json_file(file_path2)

    sps_indices1, sps_indices_unnormalized1 = calculate_sps_indices_of_smiles(data1, "products")
    sps_indices2, sps_indices_unnormalized2 = calculate_sps_indices_of_smiles(data2, "smiles")

    # Normalised scores of "products" versus "smiles"
    plot_sps_distribution(sps_indices1, sps_indices2, 'MedRDB', 'USPTO-50k', 'Distribution of nSPS of MedRDB and USPTO-50k')

    # Unnormalised scores of "products" versus "smiles"; the axis must not
    # be labelled as nSPS here.
    plot_sps_distribution(sps_indices_unnormalized1, sps_indices_unnormalized2, 'Products', 'Smiles', 'Unnormalized SPS Index Distribution', xlabel='SPS Value')

# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()