In [1]:
import os
from typing import List

import numpy as np
import pandas as pd
import torch
from rdkit.Chem.rdchem import Mol
from rdkit.Chem.rdmolfiles import MolFromSmarts
from src.data.components.utils import (
    smiles2vector_fg,
    smiles2vector_mfg,
    standardize_smiles,
)
from src.models.fgr_module import FGRPretrainLitModule
from tokenizers import Tokenizer

In [3]:
import sys
# Make modules two directories above the working directory importable.
# NOTE(review): this cell comes after the `src.*` imports in the first cell, so on a
# fresh kernel those imports execute before this path is added. Presumably this must
# run first (or the project must be installed) -- confirm and reorder the cells.
sys.path.append("../../")

In [4]:
# Restore the pretrained FGR Lightning module from its checkpoint file.
ckpt_path = "/home/rajeeva/Project/outputs/epoch_000_val_0.8505.ckpt"
model = FGRPretrainLitModule.load_from_checkpoint(ckpt_path)
# Put the module in eval mode; the trailing ';' suppresses the cell's repr output.
model.eval();

/home/rajeeva/conda/envs/DL/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:198: Attribute 'net' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['net'])`.
/home/rajeeva/conda/envs/DL/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:198: Attribute 'recon_loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['recon_loss'])`.


In [5]:
# SMARTS strings of the functional groups, read from the "SMARTS" column.
fgroups = pd.read_parquet("fg.parquet")["SMARTS"].tolist()
# One RDKit pattern object per SMARTS string, in the same order.
fgroups_list = list(map(MolFromSmarts, fgroups))
# Tokenizer definition stored as JSON under ./tokenizers.
tokenizer = Tokenizer.from_file(os.path.join("tokenizers", f"BPE_pubchem_{500}.json"))

In [6]:
def get_representation(
    smiles: List[str],
    method: str,
    fgroups_list: "List[Mol]",
    tokenizer: "Tokenizer",
) -> np.ndarray:
    """Build the numeric input representation for a collection of SMILES strings.

    Every SMILES is first passed through ``standardize_smiles``; each standardized
    string is then featurized and the per-molecule vectors are stacked row-wise.

    Args:
        smiles: SMILES strings to featurize (any iterable of strings works).
        method: Which representation to build:
            ``"FG"``  -- vectors from ``smiles2vector_fg(smi, fgroups_list)``;
            ``"MFG"`` -- vectors from ``smiles2vector_mfg(smi, tokenizer)``;
            ``"FGR"`` -- both of the above concatenated along axis 1, FG columns first.
        fgroups_list: Functional-group patterns handed to ``smiles2vector_fg``
            (the notebook builds them with ``MolFromSmarts``).
        tokenizer: Tokenizer handed to ``smiles2vector_mfg``.

    Returns:
        A 2-D array with one row per input SMILES.

    Raises:
        ValueError: If ``method`` is not one of the supported names, or if
            ``smiles`` contains no entries.
    """
    # Reject a bad method name up front, before any per-molecule work is done.
    if method not in ("FG", "MFG", "FGR"):
        raise ValueError(
            f"Method not supported: {method!r} (expected 'FG', 'MFG' or 'FGR')"
        )

    smiles = [standardize_smiles(smi) for smi in smiles]  # Standardize smiles
    if not smiles:
        # np.stack would otherwise fail with an unrelated-looking message.
        raise ValueError("smiles is empty; nothing to featurize")

    # Compute only the parts the requested method needs; FGR needs both.
    parts = []
    if method in ("FG", "FGR"):
        parts.append(np.stack([smiles2vector_fg(smi, fgroups_list) for smi in smiles]))
    if method in ("MFG", "FGR"):
        parts.append(np.stack([smiles2vector_mfg(smi, tokenizer) for smi in smiles]))

    if len(parts) == 1:
        return parts[0]
    return np.concatenate(parts, axis=1)  # Concatenate both vectors

In [7]:
# Smoke test: featurize three hand-picked SMILES with the combined "FGR" method.
x = get_representation(
    ["CC(C)(C)NCC(O)c1cc(Cl)c(N)c(c1)C(F)(F)F", "CCN", "CCF"], "FGR", fgroups_list, tokenizer
)
# Move the features onto the model's device as float32.
x = torch.tensor(x, dtype=torch.float32, device=model.device)
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    z_d = model(x)

In [8]:
# Shape of the first element of the model output for the three sample molecules.
z_d[0].shape

torch.Size([3, 256])

In [9]:
# Load the compound table; the recorded output shows the columns
# "Unnamed: 0", "compound_name" and "SMILES".
df = pd.read_csv("./smiles_cleaned.csv")
# Display the frame as the cell output.
df

Unnamed: 0,compound_name,SMILES
0,"1,3-diallylurea",C1=CC=C2C(=C1)C(=CC(=C2N)N=NC3=CC=C(C=C3)C4=CC...
1,"1,4-dimethylendothall",CC12CCC(O1)(C(C2C(=O)O)C(=O)O)C
2,"1,7-octadiene",C=CCCCCC=C
3,"1,8-nonadiene",C=CCCCCCC=C
4,2-cyanoethyl acrylate,C=CC(=O)OCCC#N
...,...,...
146,fluorocytosine,C1=NC(=O)NC(=C1F)N
147,hydroquinone,C1=CC(=CC=C1O)O
148,copper,[Cu]
149,CaCl2,[Cl-].[Cl-].[Ca+2]


In [10]:
# Earlier variant kept for reference (used a "smiles_cleaned" column):
# all_smiles = df["smiles_cleaned"].to_list()
# Plain Python list of every entry in the "SMILES" column, in row order.
all_smiles = df["SMILES"].tolist()

In [11]:
# Combined "FGR" representation for every SMILES in the table.
rep = get_representation(
    smiles=all_smiles,
    method="FGR",
    fgroups_list=fgroups_list,
    tokenizer=tokenizer,
)

In [12]:
# (rows, columns) of the FGR representation: one row per SMILES string.
rep.shape

(151, 32672)

In [13]:
# Number of SMILES strings; in the recorded run it matches the row count of `rep`.
len(all_smiles)

151

In [24]:
# Encode the whole table. The features are cast to float32 on the model's device
# (same as the three-molecule example), and the forward pass runs without autograd
# because only the outputs are needed.
with torch.no_grad():
    z = model(torch.from_numpy(rep).to(device=model.device, dtype=torch.float32))

In [26]:
# Shape of the first element of the model output for the full SMILES list.
z[0].shape

torch.Size([151, 256])

In [28]:
# Tabulate the first model output: one row per molecule, one "latent_<i>" column per
# dimension. The column count is read from the tensor instead of being hard-coded.
latent_df = pd.DataFrame(
    data=z[0].cpu().detach().numpy(),
    columns=[f"latent_{i}" for i in range(z[0].shape[1])],
)

In [31]:
# Prefix the latent columns with the compound name (exposed as "Condition"),
# aligning rows by index. `latent_df` is rebound here, so this cell is intended to
# run once after the cell that builds the unlabelled frame.
latent_df = (
    df[["compound_name"]]
    .rename(columns={"compound_name": "Condition"})
    .join(latent_df)
)

In [33]:
# latent_df.to_csv("../../../data/latent_pubchem_256.csv", index=False)

In [14]:
# Tabulate the raw FGR representation: one "chem_<i>" column per feature. The column
# count is read from `rep` instead of being hard-coded. Note that this rebinds
# `latent_df`, which from here on holds the input features, not the embeddings.
latent_df = pd.DataFrame(data=rep, columns=[f"chem_{i}" for i in range(rep.shape[1])])
# latent_df = pd.concat((df[['Condition', 'smiles_cleaned']].dropna(axis=0), latent_df),
#   ignore_index=True, axis =1)

In [None]:
# Write the compound names (as "Condition") plus the chem_* feature columns to parquet.
# The loaded table has no "Condition" column (its columns are "Unnamed: 0",
# "compound_name", "SMILES"), so the name column is renamed first, as is done for the
# latent-embedding frame.
df[["compound_name"]].rename(columns={"compound_name": "Condition"}).join(
    latent_df
).to_parquet("../../data/cancer/ChemGroup.parquet", compression="zstd")

In [15]:
# Distinct values in the "chem_1" feature column (a single 0.0 in the recorded run).
latent_df["chem_1"].unique()

array([0.], dtype=float32)

In [20]:
# Exploratory lookup of the torch.from_numpy docstring; the notebook passes `rep`
# through this function before calling the model.
help(torch.from_numpy)

Help on built-in function from_numpy in module torch:

from_numpy(...)
    from_numpy(ndarray) -> Tensor
    
    Creates a :class:`Tensor` from a :class:`numpy.ndarray`.
    
    The returned tensor and :attr:`ndarray` share the same memory. Modifications to
    the tensor will be reflected in the :attr:`ndarray` and vice versa. The returned
    tensor is not resizable.
    
    It currently accepts :attr:`ndarray` with dtypes of ``numpy.float64``,
    ``numpy.float32``, ``numpy.float16``, ``numpy.complex64``, ``numpy.complex128``,
    ``numpy.int64``, ``numpy.int32``, ``numpy.int16``, ``numpy.int8``, ``numpy.uint8``,
    and ``numpy.bool``.
    
        Writing to a tensor created from a read-only NumPy array is not supported and will result in undefined behavior.
    
    Example::
    
        >>> a = numpy.array([1, 2, 3])
        >>> t = torch.from_numpy(a)
        >>> t
        tensor([ 1,  2,  3])
        >>> t[0] = -1
        >>> a
        array([-1,  2,  3])

