In [1]:
import os
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/HIV_inhibitors_classification_and_generation/research'

In [2]:
os.chdir("../")
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/HIV_inhibitors_classification_and_generation'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_csv: Path
    train_csv: Path
    test_csv: Path
    train_folder: Path
    test_folder: Path
    params: dict
    dataset_val_status: bool

In [4]:
from hivclass.constants import *
from hivclass.utils.main_utils import create_directories, read_yaml

class ConfigurationManager:
    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH
    ):
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)
        
        create_directories([self.config.artifacts_root])
    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        params = self.params.data_transformation
        dataset_val_status_file = self.config.data_validation.STATUS_FILE
        
        with open(dataset_val_status_file, 'r') as f:
            status = f.read()
        
        status = bool(str.split(status)[-1])
        
        create_directories([config.root_dir, config.train_folder, config.test_folder])
        
        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_csv=config.data_csv,
            train_csv=config.train_csv,
            test_csv=config.test_csv,
            train_folder=config.train_folder,
            test_folder=config.test_folder,
            params=params.split_size,
            dataset_val_status=status
        )
        
        return data_transformation_config

In [9]:
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import numpy as np
from rdkit import Chem
import torch
from torch_geometric.data import Data
import deepchem as dc

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
    
    def test_split_train_balanced(self):
        df = pd.read_csv(self.config.data_csv)
        
        # Separate positive and negative cases
        p_val = df[df.HIV_active == 1].to_numpy()
        n_val = df[df.HIV_active == 0].to_numpy()

        # Ensure class balance by selecting the smaller group as the target
        if len(p_val) >= len(n_val):
            big, small = p_val, n_val
        else:
            big, small = n_val, p_val

        # Stratified test split
        small_train, small_test = train_test_split(
            small,
            test_size=self.config.params,
            random_state=42
        )
        
        big_train, big_test = train_test_split(
            big,
            test_size=(self.config.params * len(small) / len(big)),
            random_state=42
        )

        test = np.concatenate([small_test, big_test])
        
        # Ensure the train set remains balanced by oversampling the smaller class
        train = np.concatenate([
            big_train,
            random.choices(small_train, k=len(big_train) - len(small_train))
        ])

        # Convert back to DataFrame
        train_df = pd.DataFrame(train, columns=df.columns).sample(frac=1, random_state=42)
        train_df.reset_index(drop=True, inplace=True)
        # train_name_column = [f'train_{i+1}' for i in train_df.index]
        # train_df.insert(0, 'name', train_name_column)
        train_df.insert(0, 'name', 'train')
        # train_df.to_csv(self.config.train_csv, index=False)
        
        test_df = pd.DataFrame(test, columns=df.columns).sample(frac=1, random_state=42)
        test_df.reset_index(drop=True, inplace=True)
        # test_name_column = [f'test{i+1}' for i in train_df.index]
        # test_df.insert(0, 'name', test_name_column)
        test_df.insert(0, 'name', 'test')
        # test_df.to_csv(self.config.test_csv, index=False)
        
        return [train_df, test_df]
    
    def data_preparation(self, dfs):
        device = "cuda" if torch.cuda.is_available() else "cpu"

        datas = dfs
        save_dirs = [self.config.train_folder, self.config.test_folder]
        featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

        all_deepchem_data = []

        for data, save_dir in zip(datas, save_dirs):
            # data_df = pd.read_csv(data)
            # name = os.path.splitext(os.path.basename(data))[0]  # Extract filename without extension
            name = data.name[0]
            i = 0
            data_df = pd.DataFrame(columns=['name', 'smiles', 'HIV_active'])
            
            for mol in tqdm(data.itertuples(index=True), total=len(data), desc=f"Processing {name}"):

                mol_obj = Chem.MolFromSmiles(mol.smiles)
                if mol_obj is None:
                    continue  # Skip invalid SMILES strings
                
                label = torch.tensor(mol.HIV_active, dtype=torch.int64, device=device)
                
                graph_features = featurizer._featurize(mol_obj)

                data_deepchem = Data(
                    x=torch.tensor(graph_features.node_features, dtype=torch.float, device=device),
                    edge_attr=torch.tensor(graph_features.edge_features, dtype=torch.float, device=device),
                    edge_index=torch.tensor(graph_features.edge_index, dtype=torch.long, device=device),
                    y=label, smiles=mol.smiles
                )
                all_deepchem_data.append(data_deepchem)

                # Save preprocessed molecule graph
                filename = os.path.join(save_dir, f"{name}_{i}.pt")
                torch.save(data_deepchem, filename)
                
                if os.path.exists(filename):
                    mol_graph = torch.load(filename, weights_only=False)
                    
                    if mol_graph.y.item() != None:
                        data_df.loc[i] = [f"{name}_{i}.pt", mol_graph.smiles, mol_graph.y.item()]
                        i += 1
            
            data_df.to_csv(os.path.join(save_dir, f"{name}.csv"), index=False)
    
    def transformation_compose(self):
        dfs = self.test_split_train_balanced()
        self.data_preparation(dfs)

In [10]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    results_rdkit, results_deepchem = data_transformation.transformation_compose()

except Exception as e:
    raise e

[2025-04-07 19:28:50,059: INFO: main_utils: created directory at: artifacts]
[2025-04-07 19:28:50,061: INFO: main_utils: created directory at: artifacts/data_transformation]
[2025-04-07 19:28:50,061: INFO: main_utils: created directory at: artifacts/data_transformation/train]
[2025-04-07 19:28:50,062: INFO: main_utils: created directory at: artifacts/data_transformation/test]


Processing train:   0%|          | 0/77636 [00:00<?, ?it/s]

[2025-04-07 19:28:50,287: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 4/77636 [00:00<37:36, 34.40it/s]

[2025-04-07 19:28:50,364: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 19/77636 [00:00<20:29, 63.14it/s]

[2025-04-07 19:28:50,572: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 26/77636 [00:00<21:51, 59.19it/s]

[2025-04-07 19:28:50,815: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:28:50,842: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 32/77636 [00:00<29:35, 43.70it/s]

[2025-04-07 19:28:50,856: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 41/77636 [00:00<23:20, 55.39it/s]

[2025-04-07 19:28:51,058: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 48/77636 [00:00<23:11, 55.75it/s]

[2025-04-07 19:28:51,162: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 62/77636 [00:01<24:24, 52.99it/s]

[2025-04-07 19:28:51,429: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 75/77636 [00:01<23:23, 55.25it/s]

[2025-04-07 19:28:51,699: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 81/77636 [00:01<25:52, 49.95it/s]

[2025-04-07 19:28:51,761: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 87/77636 [00:01<25:09, 51.36it/s]

[2025-04-07 19:28:51,866: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 93/77636 [00:01<24:58, 51.76it/s]

[2025-04-07 19:28:51,978: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 108/77636 [00:01<21:24, 60.33it/s]

[2025-04-07 19:28:52,295: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 115/77636 [00:02<22:43, 56.85it/s]

[2025-04-07 19:28:52,386: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 123/77636 [00:02<20:59, 61.54it/s]

[2025-04-07 19:28:52,471: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 130/77636 [00:02<20:55, 61.73it/s]

[2025-04-07 19:28:52,564: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:28:52,618: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:28:52,723: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 137/77636 [00:02<25:56, 49.78it/s]

[2025-04-07 19:28:52,797: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 150/77636 [00:02<25:26, 50.77it/s]

[2025-04-07 19:28:53,058: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 163/77636 [00:03<22:48, 56.61it/s]

[2025-04-07 19:28:53,260: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 171/77636 [00:03<21:00, 61.45it/s]

[2025-04-07 19:28:53,486: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 187/77636 [00:03<22:28, 57.42it/s]

[2025-04-07 19:28:53,749: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 194/77636 [00:03<22:58, 56.18it/s]

[2025-04-07 19:28:53,816: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 222/77636 [00:04<24:23, 52.90it/s]

[2025-04-07 19:28:54,333: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 230/77636 [00:04<22:08, 58.26it/s]

[2025-04-07 19:28:54,411: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 264/77636 [00:04<18:03, 71.38it/s]

[2025-04-07 19:28:54,876: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 294/77636 [00:05<21:57, 58.69it/s]

[2025-04-07 19:28:55,477: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 301/77636 [00:05<22:37, 56.99it/s]

[2025-04-07 19:28:55,688: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 308/77636 [00:05<27:00, 47.73it/s]

[2025-04-07 19:28:55,755: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 314/77636 [00:05<25:49, 49.90it/s]

[2025-04-07 19:28:55,916: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 337/77636 [00:06<21:18, 60.47it/s]

[2025-04-07 19:28:56,204: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:28:56,329: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 350/77636 [00:06<24:00, 53.64it/s]

[2025-04-07 19:28:56,607: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   0%|          | 376/77636 [00:06<19:55, 64.61it/s]

[2025-04-07 19:28:56,913: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 390/77636 [00:06<20:24, 63.06it/s]

[2025-04-07 19:28:57,236: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 412/77636 [00:07<17:27, 73.74it/s]

[2025-04-07 19:28:57,447: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 473/77636 [00:07<17:02, 75.49it/s]

[2025-04-07 19:28:58,230: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:28:58,231: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 481/77636 [00:08<20:30, 62.73it/s]

[2025-04-07 19:28:58,375: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:28:58,443: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 505/77636 [00:08<20:45, 61.94it/s]

[2025-04-07 19:28:58,848: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 518/77636 [00:08<23:41, 54.26it/s]

[2025-04-07 19:28:59,046: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:28:59,107: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 547/77636 [00:09<19:49, 64.82it/s]

[2025-04-07 19:28:59,533: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 556/77636 [00:09<17:57, 71.52it/s]

[2025-04-07 19:28:59,644: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 567/77636 [00:09<15:46, 81.46it/s]

[2025-04-07 19:28:59,741: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 584/77636 [00:09<17:32, 73.18it/s]

[2025-04-07 19:28:59,960: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:00,033: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 623/77636 [00:10<20:55, 61.36it/s]

[2025-04-07 19:29:00,645: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 654/77636 [00:10<19:00, 67.49it/s]

[2025-04-07 19:29:01,049: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 668/77636 [00:11<20:10, 63.57it/s]

[2025-04-07 19:29:01,319: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 676/77636 [00:11<19:17, 66.47it/s]

[2025-04-07 19:29:01,472: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 683/77636 [00:11<21:08, 60.65it/s]

[2025-04-07 19:29:01,591: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:01,594: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 794/77636 [00:13<20:54, 61.26it/s]

[2025-04-07 19:29:03,566: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 801/77636 [00:13<22:30, 56.91it/s]

[2025-04-07 19:29:03,739: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 808/77636 [00:13<21:40, 59.05it/s]

[2025-04-07 19:29:03,780: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 815/77636 [00:13<24:52, 51.46it/s]

[2025-04-07 19:29:03,996: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 821/77636 [00:13<24:06, 53.11it/s]

[2025-04-07 19:29:04,104: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 827/77636 [00:14<31:03, 41.22it/s]

[2025-04-07 19:29:04,369: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 845/77636 [00:14<22:35, 56.66it/s]

[2025-04-07 19:29:04,583: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:04,585: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:04,586: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:04,588: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:04,590: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 855/77636 [00:14<19:41, 64.98it/s]

[2025-04-07 19:29:04,676: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 863/77636 [00:14<19:46, 64.68it/s]

[2025-04-07 19:29:04,740: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:04,794: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:04,921: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 870/77636 [00:14<24:34, 52.08it/s]

[2025-04-07 19:29:04,985: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 899/77636 [00:15<23:04, 55.41it/s]

[2025-04-07 19:29:05,564: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 908/77636 [00:15<22:38, 56.48it/s]

[2025-04-07 19:29:05,610: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 914/77636 [00:15<23:19, 54.82it/s]

[2025-04-07 19:29:05,761: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|          | 948/77636 [00:16<20:01, 63.83it/s]

[2025-04-07 19:29:06,224: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 984/77636 [00:16<20:10, 63.35it/s]

[2025-04-07 19:29:06,819: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:06,821: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:06,850: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:06,851: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1029/77636 [00:17<17:06, 74.61it/s]

[2025-04-07 19:29:07,583: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1047/77636 [00:17<18:33, 68.81it/s]

[2025-04-07 19:29:07,806: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1056/77636 [00:17<17:45, 71.85it/s]

[2025-04-07 19:29:07,897: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:07,947: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1064/77636 [00:17<19:31, 65.38it/s]

[2025-04-07 19:29:08,031: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:08,033: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:08,034: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1078/77636 [00:18<21:39, 58.91it/s]

[2025-04-07 19:29:08,299: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1088/77636 [00:18<18:33, 68.74it/s]

[2025-04-07 19:29:08,389: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1096/77636 [00:18<18:49, 67.74it/s]

[2025-04-07 19:29:08,570: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1123/77636 [00:18<21:34, 59.12it/s]

[2025-04-07 19:29:09,083: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1140/77636 [00:19<20:05, 63.47it/s]

[2025-04-07 19:29:09,335: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1147/77636 [00:19<21:42, 58.73it/s]

[2025-04-07 19:29:09,470: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1154/77636 [00:19<21:43, 58.69it/s]

[2025-04-07 19:29:09,594: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   1%|▏         | 1161/77636 [00:19<21:35, 59.03it/s]

[2025-04-07 19:29:09,669: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:09,670: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:09,672: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1183/77636 [00:19<20:05, 63.42it/s]

[2025-04-07 19:29:09,938: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:10,086: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1198/77636 [00:20<21:26, 59.41it/s]

[2025-04-07 19:29:10,243: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:10,318: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1212/77636 [00:20<22:36, 56.32it/s]

[2025-04-07 19:29:10,581: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1218/77636 [00:20<24:58, 51.01it/s]

[2025-04-07 19:29:10,717: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1224/77636 [00:20<24:59, 50.97it/s]

[2025-04-07 19:29:10,759: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:10,760: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1259/77636 [00:21<18:14, 69.78it/s]

[2025-04-07 19:29:11,310: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1295/77636 [00:21<27:05, 46.95it/s]

[2025-04-07 19:29:12,093: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1301/77636 [00:21<25:52, 49.18it/s]

[2025-04-07 19:29:12,245: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1307/77636 [00:22<26:58, 47.16it/s]

[2025-04-07 19:29:12,382: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1333/77636 [00:22<18:25, 69.01it/s]

[2025-04-07 19:29:12,661: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:12,686: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1341/77636 [00:22<19:54, 63.86it/s]

[2025-04-07 19:29:12,813: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1350/77636 [00:22<18:24, 69.09it/s]

[2025-04-07 19:29:12,959: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:13,074: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1358/77636 [00:22<22:40, 56.08it/s]

[2025-04-07 19:29:13,264: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1365/77636 [00:23<25:49, 49.21it/s]

[2025-04-07 19:29:13,294: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1382/77636 [00:23<21:34, 58.91it/s]

[2025-04-07 19:29:13,545: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1389/77636 [00:23<22:50, 55.64it/s]

[2025-04-07 19:29:13,699: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1396/77636 [00:23<21:40, 58.63it/s]

[2025-04-07 19:29:13,874: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1473/77636 [00:24<19:11, 66.14it/s]

[2025-04-07 19:29:14,848: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:14,850: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:14,851: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:14,854: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1497/77636 [00:24<17:46, 71.40it/s]

[2025-04-07 19:29:15,131: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1506/77636 [00:25<16:41, 76.05it/s]

[2025-04-07 19:29:15,293: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1514/77636 [00:25<17:15, 73.54it/s]

[2025-04-07 19:29:15,430: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1530/77636 [00:25<21:25, 59.19it/s]

[2025-04-07 19:29:15,737: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:15,776: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:15,823: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:15,861: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1544/77636 [00:25<23:20, 54.34it/s]

[2025-04-07 19:29:16,022: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:16,023: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1551/77636 [00:25<22:50, 55.52it/s]

[2025-04-07 19:29:16,181: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1565/77636 [00:26<22:20, 56.73it/s]

[2025-04-07 19:29:16,465: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:16,507: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1579/77636 [00:26<23:39, 53.58it/s]

[2025-04-07 19:29:16,636: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1594/77636 [00:26<22:32, 56.23it/s]

[2025-04-07 19:29:17,002: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1609/77636 [00:27<23:13, 54.55it/s]

[2025-04-07 19:29:17,344: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1615/77636 [00:27<25:33, 49.58it/s]

[2025-04-07 19:29:17,492: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1661/77636 [00:27<18:27, 68.61it/s]

[2025-04-07 19:29:18,135: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1669/77636 [00:28<22:11, 57.05it/s]

[2025-04-07 19:29:18,335: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1684/77636 [00:28<19:53, 63.63it/s]

[2025-04-07 19:29:18,540: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1691/77636 [00:28<20:40, 61.22it/s]

[2025-04-07 19:29:18,695: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1705/77636 [00:28<19:53, 63.61it/s]

[2025-04-07 19:29:18,850: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:18,970: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:18,985: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:18,986: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1712/77636 [00:28<24:00, 52.72it/s]

[2025-04-07 19:29:19,086: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1729/77636 [00:29<19:35, 64.56it/s]

[2025-04-07 19:29:19,220: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1755/77636 [00:29<16:48, 75.21it/s]

[2025-04-07 19:29:19,612: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:19,675: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1797/77636 [00:29<16:16, 77.64it/s]

[2025-04-07 19:29:20,150: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1841/77636 [00:30<16:05, 78.52it/s]

[2025-04-07 19:29:20,807: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:20,946: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1850/77636 [00:30<21:33, 58.60it/s]

[2025-04-07 19:29:21,058: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1891/77636 [00:31<18:40, 67.58it/s]

[2025-04-07 19:29:21,688: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1899/77636 [00:31<20:38, 61.17it/s]

[2025-04-07 19:29:21,831: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1906/77636 [00:31<21:21, 59.11it/s]

[2025-04-07 19:29:21,962: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1920/77636 [00:31<21:30, 58.67it/s]

[2025-04-07 19:29:22,175: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   2%|▏         | 1935/77636 [00:32<19:41, 64.08it/s]

[2025-04-07 19:29:22,406: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 1944/77636 [00:32<17:48, 70.83it/s]

[2025-04-07 19:29:22,546: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:22,605: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 1952/77636 [00:32<20:13, 62.36it/s]

[2025-04-07 19:29:22,759: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 1960/77636 [00:32<21:17, 59.23it/s]

[2025-04-07 19:29:22,782: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 1979/77636 [00:32<19:31, 64.59it/s]

[2025-04-07 19:29:23,117: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 1986/77636 [00:32<19:10, 65.77it/s]

[2025-04-07 19:29:23,146: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 1993/77636 [00:33<19:18, 65.27it/s]

[2025-04-07 19:29:23,315: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2014/77636 [00:33<20:10, 62.48it/s]

[2025-04-07 19:29:23,663: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:23,703: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2033/77636 [00:33<24:39, 51.10it/s]

[2025-04-07 19:29:24,045: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2055/77636 [00:34<22:50, 55.13it/s]

[2025-04-07 19:29:24,450: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2080/77636 [00:34<17:57, 70.11it/s]

[2025-04-07 19:29:24,889: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2088/77636 [00:34<21:01, 59.90it/s]

[2025-04-07 19:29:25,036: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2104/77636 [00:35<20:42, 60.81it/s]

[2025-04-07 19:29:25,242: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2128/77636 [00:35<18:44, 67.13it/s]

[2025-04-07 19:29:25,598: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:25,627: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:25,629: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2143/77636 [00:35<22:07, 56.87it/s]

[2025-04-07 19:29:25,995: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2159/77636 [00:35<22:57, 54.80it/s]

[2025-04-07 19:29:26,216: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2165/77636 [00:36<23:11, 54.22it/s]

[2025-04-07 19:29:26,417: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2186/77636 [00:36<22:12, 56.61it/s]

[2025-04-07 19:29:26,814: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2203/77636 [00:36<20:13, 62.16it/s]

[2025-04-07 19:29:27,019: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2238/77636 [00:37<16:45, 74.96it/s]

[2025-04-07 19:29:27,428: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2246/77636 [00:37<20:38, 60.88it/s]

[2025-04-07 19:29:27,672: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2292/77636 [00:38<24:05, 52.12it/s]

[2025-04-07 19:29:28,780: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:28,844: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2309/77636 [00:38<20:45, 60.49it/s]

[2025-04-07 19:29:29,016: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2316/77636 [00:38<20:51, 60.19it/s]

[2025-04-07 19:29:29,628: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2323/77636 [00:39<43:38, 28.76it/s]

[2025-04-07 19:29:29,977: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2331/77636 [00:39<46:07, 27.21it/s]

[2025-04-07 19:29:30,036: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:30,138: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2352/77636 [00:40<30:51, 40.66it/s]

[2025-04-07 19:29:30,397: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2398/77636 [00:40<17:53, 70.10it/s]

[2025-04-07 19:29:31,012: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2482/77636 [00:42<18:42, 66.94it/s]

[2025-04-07 19:29:32,375: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2489/77636 [00:42<20:05, 62.35it/s]

[2025-04-07 19:29:32,473: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:32,572: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2511/77636 [00:42<19:42, 63.52it/s]

[2025-04-07 19:29:32,835: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2537/77636 [00:43<27:41, 45.21it/s]

[2025-04-07 19:29:33,493: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2574/77636 [00:43<22:28, 55.68it/s]

[2025-04-07 19:29:34,077: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2598/77636 [00:44<25:00, 49.99it/s]

[2025-04-07 19:29:34,688: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2604/77636 [00:44<27:57, 44.73it/s]

[2025-04-07 19:29:34,768: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2631/77636 [00:44<18:07, 68.94it/s]

[2025-04-07 19:29:35,207: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2666/77636 [00:45<18:49, 66.37it/s]

[2025-04-07 19:29:35,744: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2707/77636 [00:46<19:22, 64.47it/s]

[2025-04-07 19:29:36,397: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   3%|▎         | 2715/77636 [00:46<18:34, 67.22it/s]

[2025-04-07 19:29:36,593: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2729/77636 [00:46<21:27, 58.17it/s]

[2025-04-07 19:29:36,822: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2745/77636 [00:46<20:06, 62.06it/s]

[2025-04-07 19:29:36,961: INFO: molecule_feature_utils: input SP3D not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:37,101: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2758/77636 [00:47<21:57, 56.84it/s]

[2025-04-07 19:29:37,325: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2772/77636 [00:47<20:35, 60.62it/s]

[2025-04-07 19:29:37,496: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2793/77636 [00:47<22:56, 54.37it/s]

[2025-04-07 19:29:37,980: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2801/77636 [00:47<22:37, 55.12it/s]

[2025-04-07 19:29:38,046: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:38,137: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2822/77636 [00:48<20:50, 59.83it/s]

[2025-04-07 19:29:38,413: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:38,415: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:38,416: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2879/77636 [00:49<22:35, 55.17it/s]

[2025-04-07 19:29:39,578: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2886/77636 [00:49<23:05, 53.96it/s]

[2025-04-07 19:29:39,651: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▎         | 2899/77636 [00:49<23:40, 52.62it/s]

[2025-04-07 19:29:39,845: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▍         | 2921/77636 [00:49<19:55, 62.48it/s]

[2025-04-07 19:29:40,171: INFO: molecule_feature_utils: input SP3D2 not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▍         | 2929/77636 [00:50<19:00, 65.49it/s]

[2025-04-07 19:29:40,313: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:40,405: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]
[2025-04-07 19:29:40,406: INFO: molecule_feature_utils: input UNSPECIFIED not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▍         | 2943/77636 [00:50<20:27, 60.84it/s]

[2025-04-07 19:29:40,563: INFO: molecule_feature_utils: input S not in allowable set ['SP', 'SP2', 'SP3']:]


Processing train:   4%|▍         | 2949/77636 [00:50<21:19, 58.38it/s]


KeyboardInterrupt: 