## LSTM is better for small datasets, and I have NaN values too


| Step                        | Purpose                                                         |
| --------------------------- | --------------------------------------------------------------- |
| 1. **Tokenization**         | Convert SMILES string into a sequence of characters or subwords |
| 2. **Embedding Layer**      | Map each token to a dense vector                                |
| 3. **LSTM Encoder**         | Learn sequential patterns of the molecule                       |
| 4. **Pooling / Projection** | Reduce sequence output to a fixed-size embedding                |
| 5. **Save/Use Embedding**   | Use for downstream models (e.g., hybrid model, fusion, MLP)     |


In [2]:
import pandas as pd 
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
#from rdkit.Chem.rdFingerprintGenerator import MorganFingerprintGenerator
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [3]:
import torch

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.Chem.rdFingerprintGenerator as rdFingerprintGenerator

#from rdkit.Chem.rdFingerprintGenerator import MorganFingerprintGenerator


### Configuration

In [4]:
import yaml

# Path to your config file.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
CONFIG_PATH = r"D:\Skills\new\NeurIPS2\config.yaml"

# Load it. The encoding is pinned to UTF-8 so that decoding the YAML does not
# depend on the platform's locale default encoding.
with open(CONFIG_PATH, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)


In [5]:
# Read the three entries of the config's 'output' section used by this notebook.
# ffv_merged is later handed to pd.read_csv; the other two are presumably CSV
# paths as well, judging by their key names (confirm against the config file).
ffv_merged, Tc_cleaned, Tg_cleaned = (
    config['output']['ffv_merged_csv'],
    config['output']['Tc_csv'],
    config['output']['Tg_csv'],
)


## Tokeniser

SMILES are not like natural language — special characters (=, #, Cl, Br, (), []) carry precise chemical meaning. So, character-based or chem-aware tokenizers are used instead of NLP-style ones.

# SELFIES (SELF-referencIng Embedded Strings)

In [6]:
import selfies as sf

In [6]:

def smiles_to_selfies(smiles):
    """Convert a SMILES string to its SELFIES representation.

    Args:
        smiles: SMILES string to convert. Non-string values (for example a
            NaN read from a DataFrame column) count as a failed conversion.

    Returns:
        The string returned by ``sf.encoder``, or ``None`` when the input is
        not a string or the encoder raises.
    """
    if not isinstance(smiles, str):
        return None  # missing value or wrong type: nothing to encode
    try:
        return sf.encoder(smiles)
    except Exception:
        # Deliberately best-effort: an unconvertible SMILES yields None so a
        # whole list can still be processed. Catching Exception instead of a
        # bare ``except`` lets KeyboardInterrupt / SystemExit propagate.
        return None  # Invalid SMILES or conversion failure

In [7]:
def tokenize_selfies(selfie_str):
    """Split a SELFIES string into a list of its symbols.

    Args:
        selfie_str: SELFIES string, or ``None`` when the upstream conversion
            failed.

    Returns:
        A list of symbols as produced by ``sf.split_selfies``; the
        single-element list ``['[UNK]']`` when ``selfie_str`` is ``None``.
    """
    if selfie_str is None:
        return ['[UNK]']
    # Materialise the iterator so both branches return a reusable list rather
    # than a list in one case and a single-pass iterator in the other.
    return list(sf.split_selfies(selfie_str))


In [8]:
def build_selfies_vocab(smiles_list):
    """Build a token -> integer index mapping from a list of SMILES strings.

    Each SMILES is converted with ``smiles_to_selfies`` and split with
    ``tokenize_selfies``; the distinct symbols are sorted and numbered from 2.
    Index 0 is reserved for ``'<PAD>'`` and index 1 for ``'<UNK>'``.

    Args:
        smiles_list: iterable of SMILES strings.

    Returns:
        dict mapping every observed symbol, plus ``'<PAD>'`` and ``'<UNK>'``,
        to a unique index in ``range(len(result))``.
    """
    tokens = set()
    for smi in smiles_list:
        tokens.update(tokenize_selfies(smiles_to_selfies(smi)))

    # '[UNK]' is the placeholder tokenize_selfies emits for a failed
    # conversion. It must not receive an index of its own: leaving it out makes
    # lookups fall back to '<UNK>'. The reserved names are dropped as well so
    # they can never claim a numbered slot and then be overwritten below,
    # which would leave a gap in the index range.
    tokens -= {'[UNK]', '<PAD>', '<UNK>'}

    token_to_idx = {tok: i + 2 for i, tok in enumerate(sorted(tokens))}
    token_to_idx['<PAD>'] = 0
    token_to_idx['<UNK>'] = 1
    return token_to_idx


In [9]:
def encode_selfies(smi, token_to_idx, max_len=150):
    """Encode one SMILES string as a fixed-length list of token indices.

    The SMILES is converted with ``smiles_to_selfies``, split with
    ``tokenize_selfies``, and each symbol is looked up in ``token_to_idx``
    (symbols missing from the mapping use the ``'<UNK>'`` index).

    Args:
        smi: SMILES string to encode.
        token_to_idx: mapping from symbol to index; must contain ``'<UNK>'``
            and, when padding is needed, ``'<PAD>'``.
        max_len: target length of the result.

    Returns:
        List of indices, right-padded with the ``'<PAD>'`` index when shorter
        than ``max_len`` and cut to ``max_len`` otherwise.
    """
    symbols = tokenize_selfies(smiles_to_selfies(smi))

    ids = []
    for symbol in symbols:
        ids.append(token_to_idx.get(symbol, token_to_idx['<UNK>']))

    # Too long (or exactly right): keep only the leading max_len entries.
    shortfall = max_len - len(ids)
    if shortfall <= 0:
        return ids[:max_len]

    # Too short: fill the tail with the padding index.
    return ids + [token_to_idx['<PAD>']] * shortfall


In [10]:
import torch

def selfies_list_to_tensor(smiles_list, token_to_idx, max_len=150):
    """Encode a list of SMILES strings into one integer tensor.

    Args:
        smiles_list: iterable of SMILES strings.
        token_to_idx: symbol -> index mapping passed through to
            ``encode_selfies``.
        max_len: per-sequence length passed through to ``encode_selfies``.

    Returns:
        ``torch.long`` tensor built from the per-SMILES index lists, one row
        per input SMILES.
    """
    rows = []
    for smiles in smiles_list:
        rows.append(encode_selfies(smiles, token_to_idx, max_len))
    return torch.tensor(rows, dtype=torch.long)


In [11]:
# Load the merged FFV table from the CSV location read out of the config
# ('output' -> 'ffv_merged_csv').
ffv_merged_df = pd.read_csv(ffv_merged)



In [12]:
import pandas as pd

# Derive a separate frame (the loaded one stays untouched) whose SMILES column
# has leading/trailing whitespace removed, then pull that column out as a list.
ffv_df = ffv_merged_df.assign(SMILES=ffv_merged_df['SMILES'].str.strip())
smiles_list = ffv_df['SMILES'].tolist()


In [13]:
# Build the symbol -> index vocabulary from every SMILES in the dataset, then
# encode the same list into a single integer tensor with 150 positions per row.
token_to_idx = build_selfies_vocab(smiles_list)
padded_selfies_tensor = selfies_list_to_tensor(smiles_list, token_to_idx, max_len=150)


In [1]:
# Spot-check a single encoded row of the padded tensor.
# NOTE(review): the recorded output for this cell is a NameError and its
# execution count is 1, i.e. it was run before the tensor was built. Re-run
# the notebook top to bottom so this cell shows the row.
padded_selfies_tensor[118]

NameError: name 'padded_selfies_tensor' is not defined

In [15]:
# Read the SELFIES-related entry from the config's 'output' section.
# NOTE(review): the key is spelled 'seflies'. Presumably it matches the config
# file as written; confirm it is not a typo for 'selfies'.
selfies = config['output']['seflies']

In [31]:
# Try the raw SELFIES encoder on a handful of entries (positions 5-9) and print
# either the conversion or the error message, to see why conversions fail.
# NOTE(review): in the recorded output every one of these SMILES, each
# containing '*', fails with "failed to parse input".
for smi in smiles_list[5:10]:
    try:
        selfies_str = sf.encoder(smi)
        print("SMILES:", smi, "→ SELFIES:", selfies_str)
    except Exception as e:
        # Report the failure and continue with the next SMILES.
        print(f"Conversion failed for {smi}: {e}")


Conversion failed for *OC(=O)CCCCCCCCC(=O)OC1COC2C(*)COC12: failed to parse input
	SMILES: *OC(=O)CCCCCCCCC(=O)OC1COC2C(*)COC12
Conversion failed for *Nc1ccc([C@H]2[C@@H]3C[C@H]4C[C@@H](C3)C[C@@H]2C4)cc1N*: failed to parse input
	SMILES: *Nc1ccc([C@H]2[C@@H]3C[C@H]4C[C@@H](C3)C[C@@H]2C4)cc1N*
Conversion failed for *C(=O)Nc1ccc(Oc2ccc(Oc3ccc(NC(=O)c4ccc5c(c4)C(=O)N(c4ccc(Oc6ccc(C7(c8ccc(Oc9ccc(N%10C(=O)c%11ccc(*)cc%11C%10=O)cc9)cc8)CC8CC7C7CCCC87)cc6)cc4)C5=O)cc3)c(C(C)(C)C)c2)cc1: failed to parse input
	SMILES: *C(=O)Nc1ccc(Oc2ccc(Oc3ccc(NC(=O)c4ccc5c(c4)C(=O)N(c4ccc(Oc6ccc(C7(c8ccc(Oc9ccc(N%10C(=O)c%11ccc(*)cc%11C%10=O)cc9)cc8)CC8CC7C7CCCC87)cc6)cc4)C5=O)cc3)c(C(C)(C)C)c2)cc1
Conversion failed for *CC(*)(C)C(=O)OCCCCCCCCCOc1ccc2cc(C(=O)Oc3ccccc3)ccc2c1: failed to parse input
	SMILES: *CC(*)(C)C(=O)OCCCCCCCCCOc1ccc2cc(C(=O)Oc3ccccc3)ccc2c1
Conversion failed for *Nc1ccc(-c2ccc(-c3ccc(N*)cc3)cc2)cc1: failed to parse input
	SMILES: *Nc1ccc(-c2ccc(-c3ccc(N*)cc3)cc2)cc1


In [None]:
from torch.utils.data import Dataset

class RdkitFingerprintDataset(Dataset):
    """Dataset view over a saved file of fingerprints, SMILES and ids.

    The file is read with ``torch.load`` and must hold a mapping with the keys
    ``"fingerprints"``, ``"smiles"`` and ``"id"``; the three values are indexed
    in parallel.
    """

    def __init__(self, pt_path):
        """Load the saved mapping at ``pt_path`` and keep its three entries."""
        # NOTE: torch.load unpickles the file; only point this at trusted data.
        payload = torch.load(pt_path)
        self.fingerprints, self.smiles, self.ids = (
            payload["fingerprints"],
            payload["smiles"],
            payload["id"],
        )

    def __len__(self):
        """Number of samples, taken from the fingerprints container."""
        return len(self.fingerprints)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with fingerprint, smiles and id."""
        sample = dict(
            fingerprint=self.fingerprints[idx],
            smiles=self.smiles[idx],
            id=self.ids[idx],
        )
        return sample

# Usage: open the saved fingerprint file and print its first sample.
# NOTE(review): relative path, resolved against the kernel's working directory.
dataset = RdkitFingerprintDataset("rdkit_ffv.pt")
print(dataset[0])


In [16]:
# Save the encoded tensor, the vocabulary and the source SMILES together in one
# file, so the indices can later be decoded and rows matched back to molecules.
# NOTE(review): relative output path, written to the kernel's working directory.
torch.save({
    'input_tensor': padded_selfies_tensor,  # shape: [N, 150]
    'token_to_idx': token_to_idx,           # dict mapping SELFIES tokens to IDs
    'smiles': smiles_list                   # to track or map back
}, "selfies.pt")


## LSTM Encoder