In [26]:
import deepchem as dc
import pandas as pd
import numpy as np
from rdkit import Chem
#i = 0
full = pd.read_csv("../DataSets/Full_Free_Solv/full.csv", delimiter=";")
# freesolv
freesoldf = pd.read_csv("../DataSets/freesolv.csv")
freesoldf = freesoldf.drop("Unnamed: 0", axis=1)
# MNSol
mnsoldf = pd.read_csv("../DataSets/mnsol.csv", sep = ";")
mnsoldf_water = mnsoldf.loc[mnsoldf['Solvent'] == 'water']
mnsoldf = mnsoldf_water.reset_index().drop("index", axis = 1)
# CombiSolv-EXP
combisolv = pd.read_csv("../DataSets/CombiSolv-Exp-8780.csv", sep = ",")

## Combisolv dataset preparation and cleaning..
combisolv['SolventSMILES']= combisolv['ssid'].apply(lambda x: x.split(".")[0])
combisolv['SoluteSMILES']= combisolv['ssid'].apply(lambda x: x.split(".")[1])
combisolv['Solute'] = "solute"
combisolv['Solvent'] = "solvent"
combisolv.rename(columns = {'dgsolv':'DeltaGsolv'}, inplace = True)
combisolv.drop(columns=['ssid'], inplace = True)
combisolv = combisolv.loc[combisolv['SolventSMILES'] == "O"]
combisolv.reset_index().drop("index", axis = 1, inplace = True)
combisolv = combisolv[['Solute', 'Solvent', 'SoluteSMILES',
                        'SolventSMILES', 'DeltaGsolv']]

## Adding cannonical smiles for comparison..
combisolv['cannon_smiles'] = combisolv['SoluteSMILES'].apply(lambda x : Chem.CanonSmiles(x))
mnsoldf['cannon_smiles'] = mnsoldf['SoluteSMILES'].apply(lambda x : Chem.CanonSmiles(x))
freesoldf['cannon_smiles'] = freesoldf['SoluteSMILES'].apply(lambda x : Chem.CanonSmiles(x))
full['cannon_smiles'] = full['SoluteSMILES'].apply(lambda x : Chem.CanonSmiles(x))




In [27]:
matching_indices_combisolv = combisolv.index[combisolv['cannon_smiles'].isin(full['cannon_smiles'])].tolist()
matching_indices_full = full.index[full['cannon_smiles'].isin(combisolv['cannon_smiles'])].tolist()

print("Indices in combisolv:", matching_indices_combisolv)
print("Indices in full:", matching_indices_full)

Indices in combisolv: [7, 8, 31, 36, 45, 53, 83, 100, 104, 109, 124, 133, 134, 137, 142, 162, 165, 208, 214, 216, 232, 236, 242, 243, 248, 249, 255, 258, 263, 264, 275, 279, 282, 290, 291, 298, 299, 303, 308, 330, 335, 348, 349, 354, 355, 357, 362, 382, 383, 394, 399, 410, 411, 412, 422, 437, 441, 442, 452, 460, 469, 470, 471, 480, 481, 503, 514, 539, 553, 557, 565, 568, 573, 578, 584, 588, 591, 593, 603, 630, 635, 641, 657, 663, 670, 675, 677, 693, 704, 706, 709, 717, 719, 735, 737, 744, 747, 752, 763, 772, 778, 783, 786, 788, 797, 846, 851, 861, 866, 867, 877, 885, 897, 911, 915, 941, 950, 957, 960, 962, 970, 985, 988, 989, 992, 993, 999, 1002, 1010, 1012, 1022, 1024, 1026, 1029, 1052, 1063, 1070, 1080, 1099, 1109, 1112, 1114, 1117, 1118, 1122, 1127, 1138, 1142, 1158, 1164, 1174, 1176, 1190, 1192, 1214, 1220, 1223, 1228, 1235, 1240, 1241, 1250, 1275, 1286, 1287, 1292, 1294, 1297, 1298, 1302, 1306, 1308, 1309, 1316, 1318, 1330, 1338, 1339, 1344, 1360, 1371, 1377, 1380, 1393, 1404, 141

In [29]:
#len(matching_indices_full)

In [44]:
1153 + 104 + 76

1333

In [30]:
full_excluded = full.drop(index=matching_indices_full).reset_index(drop=True)

In [39]:
matching_indices_freesoldf = freesoldf.index[freesoldf['cannon_smiles'].isin(full_excluded['cannon_smiles'])].tolist()
matching_indices_full = full_excluded.index[full_excluded['cannon_smiles'].isin(freesoldf['cannon_smiles'])].tolist()

In [41]:
np.savetxt("freesolv_comm.txt", matching_indices_freesoldf, delimiter= "\n")

In [42]:
full_excluded = full_excluded.drop(index=matching_indices_full).reset_index(drop=True)

In [45]:
matching_indices_mnsoldf = mnsoldf.index[mnsoldf['cannon_smiles'].isin(full_excluded['cannon_smiles'])].tolist()
matching_indices_full = full_excluded.index[full_excluded['cannon_smiles'].isin(mnsoldf['cannon_smiles'])].tolist()

In [50]:
pwd

'/home/dm/Dibyendu/Projects/Property_Prediction/DeltaG_Prediction/NoteBooks'

In [47]:
np.savetxt("mnsol_comm.txt", matching_indices_mnsoldf, delimiter= "\n")

In [6]:
import concurrent.futures
import random
import time
import numpy as np
from deepchem.feat import PubChemFingerprint
from tqdm import tqdm  # For progress bars

def return_pubchem(smiles, delay_range=(1, 3)):
    """Featurize SMILES with PubChem fingerprints, with random delay.

    Args:
        smiles: SMILES string
        delay_range: Tuple of (min, max) delay in seconds

    Returns:
        np.ndarray: Fingerprint features or None if failed
    """
    time.sleep(random.uniform(*delay_range))  # Random delay

    try:
        featurizer = PubChemFingerprint()
        features = featurizer.featurize(smiles)
        return features[0]  # Return first result
    except Exception as e:
        print(f"Error processing {smiles[:20]}...: {str(e)}")
        return None

def parallel_pubchem_featurization(smiles_list, max_workers=20, batch_size=10, max_retries=3):
    """Parallel PubChem featurization with rate limiting and retries.

    Args:
        smiles_list: List of SMILES strings
        max_workers: Number of parallel threads (recommend 2-4 for PubChem)
        batch_size: Process in batches to manage memory
        max_retries: Maximum retry attempts per SMILES

    Returns:
        Tuple: (success_count, failed_count, results_array)
    """
    results = []
    failed_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Process in batches with progress bar
        for i in tqdm(range(0, len(smiles_list), batch_size),
                     desc="Processing SMILES"):
            batch = smiles_list[i:i + batch_size]
            retry_batch = batch.copy()

            for attempt in range(max_retries):
                # Submit current batch
                futures = {executor.submit(return_pubchem, smi): smi
                          for smi in retry_batch}
                current_results = []
                new_retry = []

                # Process completed futures
                for future in concurrent.futures.as_completed(futures):
                    smi = futures[future]
                    try:
                        result = future.result()
                        if result is not None:
                            current_results.append(result)
                        else:
                            new_retry.append(smi)
                    except Exception:
                        new_retry.append(smi)

                results.extend(current_results)
                retry_batch = new_retry

                if not retry_batch:
                    break  # All succeeded

                if attempt < max_retries - 1:
                    print(f"Retrying {len(retry_batch)} failed SMILES (attempt {attempt + 1})")

            failed_count += len(retry_batch)

    success_count = len(results)
    print(f"Completed: {success_count} successful, {failed_count} failed")

    # Convert to numpy array with proper shape
    if results:
        return success_count, failed_count, results
    else:
        return 0, len(smiles_list), np.zeros((0, 881))  # Empty array

# Usage Example:
if __name__ == "__main__":
    smiles_examples = full['SoluteSMILES']

    success, failed, features = parallel_pubchem_featurization(
        smiles_examples,
        max_workers=20,  # Conservative for PubChem  # Random delay between 1-3 seconds
    )

    print(f"Feature matrix shape: {features.shape}")

Processing SMILES:   8%|▊         | 11/134 [00:44<08:19,  4.06s/it]Failed to featurize datapoint 0, CC(C(=O)O)c1ccc2c(c1)[nH]c1ccc(Cl)cc12. Appending empty array
Exception message: 'PUGREST.ServerBusy'
Processing SMILES:   9%|▉         | 12/134 [00:48<08:11,  4.03s/it]Failed to featurize datapoint 0, CCCCCCCOC(=O)c1ccccc1C(=O)OCCCCCCC. Appending empty array
Exception message: 'PUGREST.ServerBusy'
Processing SMILES:  14%|█▍        | 19/134 [01:16<07:48,  4.07s/it]Failed to featurize datapoint 0, C=C(C)C(=O)O. Appending empty array
Exception message: 'PUGREST.ServerBusy'
Processing SMILES:  23%|██▎       | 31/134 [02:06<07:01,  4.09s/it]Failed to featurize datapoint 0, O=C(O)CCCc1ccccc1. Appending empty array
Exception message: 'PUGREST.ServerBusy'
Processing SMILES:  27%|██▋       | 36/134 [02:26<06:38,  4.06s/it]Failed to featurize datapoint 0, CCCCCOC(=O)CCC. Appending empty array
Exception message: 'PUGREST.ServerBusy'
Processing SMILES:  31%|███       | 41/134 [02:47<06:14,  4.03s/i

Completed: 1333 successful, 0 failed





AttributeError: 'list' object has no attribute 'shape'

In [7]:
features

[array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 