In [1]:
# Install required package
!pip install tmap

# Import required libraries
import pandas as pd
import tmap
from faerun import Faerun
from mhfp.encoder import MHFPEncoder
from rdkit import Chem
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
import numpy as np

# The number of permutations used by the MinHashing algorithm
perm = 1024

# Initialize the MHFP encoder
enc = MHFPEncoder(perm)

# Load datasets
df1 = pd.read_csv("./Data/Pretraining_dataset/SSP1.csv")
df1.rename(columns={'smiles': 'SMILES'}, inplace=True)
df1['yield'] = 1  # Assign yield = 1 for this dataset
df1 = df1.iloc[:50000, :10]  # Take first 50,000 rows and first 10 columns

df2 = pd.read_csv("./Data/Pretraining_dataset/ChemBL-LM_train.csv")
df2.drop(columns=['canonical'], inplace=True)  # Drop the 'canonical' column
df2['yield'] = 0  # Assign yield = 0 for this dataset
df2 = df2.iloc[:500000, :10]  # Take first 500,000 rows and first 10 columns

# Combine datasets
df = pd.concat([df2, df1], ignore_index=True)

# Function to check if a SMILES string is valid
def is_valid_smiles(smiles):
    return Chem.MolFromSmiles(smiles) is not None

# Filter out invalid SMILES strings
df = df[df['SMILES'].apply(is_valid_smiles)]

# Create MHFP fingerprints from valid SMILES
fingerprints = [tmap.VectorUint(enc.encode(s)) for s in df["SMILES"]]

# Initialize the LSH Forest
lf = tmap.LSHForest(perm)

# Add the fingerprints to the LSH Forest and index
lf.batch_add(fingerprints)
lf.index()

# Get the coordinates
x, y, s, t, _ = tmap.layout_from_lsh_forest(lf)

# Print the coordinates (or use them for further analysis)
print("Coordinates:", x, y, s, t)






















IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [25]:
# Create a Faerun plot for the ESOL dataset
faerun = Faerun(view="front", coords=False, clear_color='#FFFFEB')

# Add scatter points with colors based on "ESOL (mol/L)" values
faerun.add_scatter(
    "ESOL_Basic",
    {
        "x": x,
        "y": y,
        "c": df['yield'],  # Use df['yield'] directly
        "labels": df["SMILES"]
    },
    point_scale=3,
    colormap=['brg_r'],
    has_legend=True,
    legend_title=['dataset'],
    categorical=[False],
    shader='smoothCircle'
)


# Add a tree structure connecting related points
faerun.add_tree("ESOL_Basic_tree", {"from": s, "to": t}, point_helper="ESOL_Basic")

# Choose the "smiles" template to display molecular structure on hover
faerun.plot('chembl', template="smiles", notebook_height=750)

In [3]:
import numpy as np

# Convert to a dictionary for saving
coordinates_dict = {'x': x, 'y': y, 's': s, 't': t}

# Save as a .npz file
np.savez("/raid/aiccg/rbsunoj/Nupur/tmap/tmap_chembl_ssp1_coordinates_output.npz", **coordinates_dict)

print("Coordinates saved to /raid/aiccg/rbsunoj/supratim/coordinates_output.npz")


Coordinates saved to /raid/aiccg/rbsunoj/supratim/coordinates_output.npz
