# In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import matplotlib.pyplot as plt
import numpy as np

# read SMILES
def read_smiles(file_path):
    """Read one SMILES string per line from a text file.

    Leading/trailing whitespace is removed from every line, and lines that
    are empty after stripping are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the non-empty, stripped lines in file order.
    """
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops a
    # leading byte-order mark. str.strip() does not remove U+FEFF, so with
    # plain 'utf-8' a BOM would stay attached to the first SMILES string.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return [entry for entry in stripped_lines if entry]

# calculate the fingerprints
def calculate_fingerprints(smiles_list):
    """Build a Morgan bit-vector fingerprint for each parsable SMILES string.

    Every string is passed to ``Chem.MolFromSmiles``; when the result is
    falsy the entry is skipped, so the returned list can be shorter than
    ``smiles_list`` and positions are not guaranteed to line up with it.

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        A list of fingerprints (radius 2, 1024 bits), in input order.
    """
    # Lazy generator: each molecule is parsed right before it is fingerprinted.
    molecules = (Chem.MolFromSmiles(smiles) for smiles in smiles_list)
    return [
        AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        for mol in molecules
        if mol
    ]

# convert the fingerprints into a numpy array
def convert_fingerprints_to_array(fingerprints):
    """Convert a sequence of fingerprints into a ``uint8`` NumPy array.

    Each fingerprint is expanded with ``list(...)`` and becomes one row of
    the result.

    Args:
        fingerprints: Iterable of fingerprints; each one must be iterable
            and all must have the same length.

    Returns:
        An array of dtype ``uint8`` with one row per fingerprint. An empty
        input yields an empty 1-D array of shape ``(0,)``.
    """
    # No NaN/inf clean-up is needed: an unsigned 8-bit integer array cannot
    # represent either value, so such a mask would never select anything.
    return np.array([list(fingerprint) for fingerprint in fingerprints], dtype=np.uint8)

# Load the SMILES strings of each group from its text file (group A first).
smiles_group_a, smiles_group_b = (
    read_smiles(path) for path in ('target_file_a.txt', 'target_file_b.txt')
)

# Compute the fingerprints of the two groups.
fingerprints_a, fingerprints_b = map(
    calculate_fingerprints, (smiles_group_a, smiles_group_b)
)

# Convert each group's fingerprints into a numpy array.
fingerprints_a_array, fingerprints_b_array = map(
    convert_fingerprints_to_array, (fingerprints_a, fingerprints_b)
)

# Stack group A's rows on top of group B's. The plotting code below splits
# the embeddings at len(fingerprints_a_array), so this order matters.
fingerprints_combined = np.vstack([fingerprints_a_array, fingerprints_b_array])

# Styling shared by every scatter plot below.
# Colours are (red, green, blue, alpha) tuples: semi-transparent red for
# group A and semi-transparent blue for group B.
color_group_a, color_group_b = (1, 0, 0, 0.6), (0, 0, 1, 0.5)
# Value passed as the `s` (marker size) argument of plt.scatter.
point_size = 15

# Project the combined fingerprint matrix to two dimensions with three
# different methods. Row order is preserved, so group A's rows come first.

# PCA: first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(fingerprints_combined)

# t-SNE: seed fixed through random_state.
tsne = TSNE(
    n_components=2,
    random_state=0,
)
tsne_results_2d = tsne.fit_transform(fingerprints_combined)

# UMAP: seed fixed through random_state.
umap_2d = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
umap_results_2d = umap_2d.fit_transform(fingerprints_combined)

# Draw the PCA scatter plot.
# The first n_group_a rows of the embedding belong to group A and the
# remaining rows to group B (the order in which the arrays were stacked).
n_group_a = len(fingerprints_a_array)
plt.figure(figsize=(10, 7))
# The colour is wrapped in a one-element list (a single-row 2-D colour):
# a bare RGBA tuple passed as `c` would be read as four values to
# colour-map whenever a group happens to contain exactly four points.
# NOTE(review): the legend labels here ('Group A'/'Group B') differ from the
# t-SNE and UMAP figures ('USPTO-50k'/'MedRDB') - confirm which is intended.
plt.scatter(pca_result[:n_group_a, 0], pca_result[:n_group_a, 1], s=point_size, c=[color_group_a], label='Group A')
plt.scatter(pca_result[n_group_a:, 0], pca_result[n_group_a:, 1], s=point_size, c=[color_group_b], label='Group B')
plt.title('PCA of Molecules')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

# Draw the 2D t-SNE scatter plot.
# The first n_group_a rows of the embedding belong to group A and the
# remaining rows to group B (the order in which the arrays were stacked).
n_group_a = len(fingerprints_a_array)
plt.figure(figsize=(10, 7))
# The colour is wrapped in a one-element list (a single-row 2-D colour):
# a bare RGBA tuple passed as `c` would be read as four values to
# colour-map whenever a group happens to contain exactly four points.
plt.scatter(tsne_results_2d[:n_group_a, 0], tsne_results_2d[:n_group_a, 1], s=point_size, c=[color_group_a], label='USPTO-50k')
plt.scatter(tsne_results_2d[n_group_a:, 0], tsne_results_2d[n_group_a:, 1], s=point_size, c=[color_group_b], label='MedRDB')
plt.legend()
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
# Hide the top/right frame lines and all tick marks on this figure.
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.xticks([])
plt.yticks([])
plt.show()

# Draw the 2D UMAP scatter plot.
# The first n_group_a rows of the embedding belong to group A and the
# remaining rows to group B (the order in which the arrays were stacked).
n_group_a = len(fingerprints_a_array)
plt.figure(figsize=(10, 7))
# The colour is wrapped in a one-element list (a single-row 2-D colour):
# a bare RGBA tuple passed as `c` would be read as four values to
# colour-map whenever a group happens to contain exactly four points.
plt.scatter(umap_results_2d[:n_group_a, 0], umap_results_2d[:n_group_a, 1], s=point_size, c=[color_group_a], label='USPTO-50k')
plt.scatter(umap_results_2d[n_group_a:, 0], umap_results_2d[n_group_a:, 1], s=point_size, c=[color_group_b], label='MedRDB')
plt.legend()
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
# Hide the top/right frame lines and all tick marks on this figure.
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.xticks([])
plt.yticks([])
plt.show()