# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

# Read the SMILES strings stored in a text file (one entry per line)
def read_smiles(file_path):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings in file order. Leading/trailing whitespace is
        removed from every line, and lines that are empty after stripping
        are left out.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Strip lazily, then keep only the entries that still have content.
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [entry for entry in stripped_lines if entry]

# Compute molecular (Morgan) fingerprints
def calculate_fingerprints(smiles_list, radius=2, n_bits=1024):
    """Compute a Morgan bit-vector fingerprint for every parsable SMILES.

    Args:
        smiles_list: Iterable of SMILES strings.
        radius: Morgan radius forwarded to RDKit. Defaults to 2, the value
            that used to be hard-coded here.
        n_bits: Length of the bit vector, forwarded to RDKit as ``nBits``.
            Defaults to 1024, the value that used to be hard-coded here.

    Returns:
        A list of fingerprints in input order. SMILES for which
        ``Chem.MolFromSmiles`` yields no molecule are skipped, so the result
        can be shorter than ``smiles_list`` and positions do not necessarily
        line up with the input.
    """
    fingerprints = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: skip it rather than abort the whole batch.
            continue
        fingerprints.append(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        )
    return fingerprints

# Load the SMILES of both datasets (group A first, then group B)
smiles_group_a, smiles_group_b = (
    read_smiles(smiles_path)
    for smiles_path in (
        '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/data_sample/USPTO_full_random_8196_smiles.txt',
        '/home/sunhnayu/jupyterlab/XXI/img2smiles_xuexi/img2smiles/notebook/data_sample/final_unique_standard_merged.txt',
    )
)

# Compute the fingerprints of each group
fingerprints_a = calculate_fingerprints(smiles_group_a)
fingerprints_b = calculate_fingerprints(smiles_group_b)

# Convert fingerprints into a dense 2-D NumPy array of bit values
def convert_fingerprints_to_array(fingerprints):
    """Stack fingerprints into an ``(n_fingerprints, n_bits)`` uint8 array.

    Args:
        fingerprints: Iterable of equal-length bit sequences; each one is
            expanded with ``list()`` to obtain its individual bit values.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` with one row per fingerprint.
        Empty input yields an array of shape ``(0, 0)`` so that the result
        is 2-D in that case as well.
    """
    rows = [list(fingerprint) for fingerprint in fingerprints]
    if not rows:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
        return np.empty((0, 0), dtype=np.uint8)
    # No NaN/inf clean-up is needed: a uint8 array cannot hold either value,
    # so the mask that used to be applied here never selected anything.
    return np.array(rows, dtype=np.uint8)

# Turn both fingerprint lists into arrays (group A first, then group B)
fingerprints_a_array, fingerprints_b_array = map(
    convert_fingerprints_to_array, (fingerprints_a, fingerprints_b)
)

# Merge the two groups: rows of A on top, rows of B below
fingerprints_combined = np.vstack([fingerprints_a_array, fingerprints_b_array])

# Label every row with its group, in the same A-then-B order
labels_combined = ['A'] * len(fingerprints_a)
labels_combined.extend(['B'] * len(fingerprints_b))

# t-SNE analysis (2-D), fitted on the stacked fingerprint matrix.
# random_state is fixed; no metric argument is passed, so the library default is used.
tsne = TSNE(n_components=2, random_state=0)
tsne_results_2d = tsne.fit_transform(fingerprints_combined)

# UMAP analysis (2-D), fitted on the same matrix with a fixed random_state.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_results_2d = umap_model.fit_transform(fingerprints_combined)

# # UMAP analysis (3-D)
# NOTE(review): umap_model above is constructed with n_components=2; passing
# n_components to fit_transform presumably does not change that — confirm
# against the umap-learn API before re-enabling this line.
# umap_results_3d = umap_model.fit_transform(fingerprints_combined, n_components=3)

# Custom colors and marker size
color_group_a = (1, 0, 0, 0.6)  # red, semi-transparent
color_group_b = (0, 0, 1, 0.5)  # blue, semi-transparent
point_size = 15  # marker size


def _plot_embedding_2d(embedding, n_group_a, axis_prefix):
    """Scatter-plot a 2-D embedding with the two groups in different colors.

    Reads the module-level ``color_group_a``, ``color_group_b`` and
    ``point_size`` settings.

    Args:
        embedding: Array with at least two columns; the first ``n_group_a``
            rows are drawn as group A ('USPTO-FULL'), the remaining rows as
            group B ('MedRDB').
        n_group_a: Number of leading rows that belong to group A.
        axis_prefix: Text used for the axis labels ('<prefix> 1', '<prefix> 2').

    Returns:
        The matplotlib Axes the scatter plot was drawn on.
    """
    plt.figure(figsize=(10, 7))
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
    # ambiguous with a sequence of values to colormap (matplotlib warns about
    # it, and a group of exactly 4 points would be colored incorrectly).
    plt.scatter(
        embedding[:n_group_a, 0],
        embedding[:n_group_a, 1],
        s=point_size,
        color=color_group_a,
        label='USPTO-FULL',
    )
    plt.scatter(
        embedding[n_group_a:, 0],
        embedding[n_group_a:, 1],
        s=point_size,
        color=color_group_b,
        label='MedRDB',
    )
    plt.legend()
    plt.xlabel(f'{axis_prefix} 1')
    plt.ylabel(f'{axis_prefix} 2')
    axes = plt.gca()
    # Hide the top/right frame lines and the ticks.
    axes.spines['top'].set_visible(False)
    axes.spines['right'].set_visible(False)
    plt.xticks([])
    plt.yticks([])
    plt.show()
    return axes


# Both embeddings were fitted on the stacked matrix, so the number of rows of
# group A's array is the split point between the two groups in either result.
n_group_a = len(fingerprints_a_array)

# Draw the 2-D t-SNE distribution plot
ax = _plot_embedding_2d(tsne_results_2d, n_group_a, 't-SNE')

# Draw the 2-D UMAP distribution plot
ax = _plot_embedding_2d(umap_results_2d, n_group_a, 'UMAP')
