#### Generate SMILES using API: https://cactus.nci.nih.gov/chemical/structure

Import relevant libraries

In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

In [3]:
def cas_to_smiles(cas_number):
    """Resolve a CAS number to a SMILES string via the NCI CACTUS resolver.

    Args:
        cas_number: Identifier to look up. It is converted to ``str``,
            stripped of surrounding whitespace and percent-encoded before
            being placed in the URL path.

    Returns:
        tuple: ``(cas_number, smiles)``. ``cas_number`` is returned exactly as
        it was passed in, so callers can use it as a lookup key. ``smiles`` is
        the stripped response body, or ``None`` when the body is empty or the
        request raised ``requests.exceptions.RequestException`` (which covers
        the HTTP error statuses raised by ``raise_for_status`` and timeouts).
    """
    # Encode every reserved character (safe="") so that a stray space, "/",
    # "?" or "#" in the identifier cannot alter the request path.
    identifier = quote(str(cas_number).strip(), safe="")
    url = f"https://cactus.nci.nih.gov/chemical/structure/{identifier}/smiles"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        smiles = response.text.strip()
        return cas_number, (smiles or None)
    except requests.exceptions.RequestException:
        # Best-effort lookup: failures are reported as "no SMILES" so that a
        # single bad identifier does not abort a batch run.
        return cas_number, None

if __name__ == "__main__":
    # Fetch a SMILES string for every distinct CAS number in the odor table
    # and write the table back out with a SMILES column next to cas_number.

    # NOTE(review): absolute, machine-specific path; consider a path relative
    # to a configurable data directory so the notebook runs elsewhere.
    input_csv = "C:/Users/suman/OneDrive/Bureau/Internship_Study/GNN_On_OdorPrediction/data/Handling_missing_data/Hierarchy_OdorCAS.csv"
    output_csv = "Odor_SMILES.csv"

    # Load CSV. cas_number is read as str (as the split step does) so the
    # values sent to the API and used as mapping keys are never type-inferred.
    df = pd.read_csv(input_csv, dtype={"cas_number": str})

    if "cas_number" not in df.columns:
        raise ValueError("Column 'cas_number' not found in the CSV file.")

    df = df.dropna(subset=["cas_number"])

    # Get CAS numbers (each distinct value is requested only once)
    unique_cas_numbers = df["cas_number"].unique()
    print("CAS numbers to process:", len(unique_cas_numbers))

    # Fetch SMILES in parallel
    print("Fetching SMILES using multithreading...")
    cas_to_smiles_dict = {}

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(cas_to_smiles, cas) for cas in unique_cas_numbers]
        for future in tqdm(as_completed(futures), total=len(futures)):
            # cas_to_smiles returns its input key, so completion order is irrelevant.
            cas, smiles = future.result()
            if smiles:  # Store only valid results
                cas_to_smiles_dict[cas] = smiles

    # Report coverage so it is clear how many CAS numbers still need a retry.
    print(f"SMILES resolved for {len(cas_to_smiles_dict)} of {len(unique_cas_numbers)} CAS numbers")

    # Map SMILES to all rows (including duplicates); unresolved CAS numbers get NaN
    df["SMILES"] = df["cas_number"].map(cas_to_smiles_dict)

    # Reorder columns: CAS + SMILES first
    first_cols = ["cas_number", "SMILES"]
    other_cols = [col for col in df.columns if col not in first_cols]
    df = df[first_cols + other_cols]

    # Save result
    df.to_csv(output_csv, index=False)


CAS numbers to process: 3839
Fetching SMILES using multithreading...


100%|██████████| 3839/3839 [02:44<00:00, 23.40it/s]


Because SMILES generation may be interrupted by API rate limits, we handle this by splitting the data. We first generate SMILES for the available CAS numbers and split the output into two parts: `Odor_SMILES_part1.csv` (rows with successfully generated SMILES) and `Odor_SMILES_part2.csv` (rows whose SMILES are still missing). We then run part 2 through the same process again, producing a new file that is split in the same way. The newly generated SMILES (`Part3`) are concatenated with part 1, and the cycle continues until the API stops returning results entirely.

#### **Split** CSV

In [None]:
# Load the CSV (CAS numbers are kept as strings)
df = pd.read_csv("/content/OdorSmiles_Updated.csv", dtype={'cas_number': str})

if "SMILES" not in df.columns:
    raise ValueError("Column 'SMILES' not found in the CSV file.")

# Split row: position of the first row whose SMILES is missing, i.e. the point
# up to which the SMILES have been fetched. If nothing is missing, everything
# goes to part 1.
# NOTE(review): assumes fetched rows form a contiguous run at the top of the
# file; assign split_row by hand if a different cut is wanted.
smiles_missing = df["SMILES"].isna().to_numpy()
split_row = int(smiles_missing.argmax()) if smiles_missing.any() else len(df)

# Keep header + row 0 in both parts
# NOTE(review): row 0 is a data row (read_csv already consumed the header), so
# it appears twice once the parts are merged again. Confirm this is intended.
df_part1 = df.iloc[:split_row]
if split_row > 0:
    df_part2 = pd.concat([df.iloc[[0]], df.iloc[split_row:]], ignore_index=True)
else:
    # Row 0 is already the first row of part 2 (or the frame is empty), so
    # prepending it would duplicate it inside part 2 or raise IndexError.
    df_part2 = df.iloc[split_row:].reset_index(drop=True)

# Save the parts
df_part1.to_csv("Odor_SMILES_part1.csv", index=False)
df_part2.to_csv("Odor_SMILES_part2.csv", index=False)

print(f"CSV file split at row {split_row}, keeping the first row in both parts.")


#### Concatenate 2 CSV files

In [None]:
# Load both CSV files
# NOTE: the two paths below are placeholders; replace them with the CSV that
# already holds SMILES and the CSV newly generated from Odor_SMILES_part2.csv.
df1 = pd.read_csv("Inital CSV with smiles")
df2 = pd.read_csv("newly generated csv from Odor_SMILES_part2.csv")

# Concatenate them as rows (df2 rows follow df1 rows, index renumbered)
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save to a new file; the same variable feeds the message below so the
# reported name always matches the file actually written.
merged_path = "Odor_SMILES_merged12.csv"
merged_df.to_csv(merged_path, index=False)

print(f"Files merged successfully into '{merged_path}'")


More than 2000 SMILES were successfully retrieved from CAS numbers using the CACTUS API. For the remaining CAS values where CACTUS did not return results, we used the PubChemPy API to complete the SMILES extraction. This two-step approach ensured maximum coverage.

In [4]:
import pubchempy as pcp

# CAS numbers for which the lookup is repeated through PubChem.
list_cas = [
    "16400-72-9",
    "16409-43-1",
    "16409-45-3",
    "16409-46-4",
    "16423-19-1",
    "16424-55-8",
    "16429-21-3",
]

for cas in list_cas:
    # The CAS string is submitted through PubChem's 'name' namespace.
    cids = pcp.get_cids(cas, namespace='name')
    if not cids:
        # No match: an empty CID list cannot be passed on to get_properties,
        # so report the CAS number and continue with the next one.
        print(f"No PubChem CID found for CAS {cas}")
        continue
    lookup_result = pcp.get_properties(
        ['inchi', 'inchikey', 'canonical_smiles', 'isomeric_smiles', 'iupac_name'],
        cids,
    )
    print(lookup_result)

[{'CID': 61834, 'CanonicalSMILES': 'CCCCCCCC1CC=CC(=O)O1', 'IsomericSMILES': 'CCCCCCCC1CC=CC(=O)O1', 'InChI': 'InChI=1S/C12H20O2/c1-2-3-4-5-6-8-11-9-7-10-12(13)14-11/h7,10-11H,2-6,8-9H2,1H3', 'InChIKey': 'XPTXKXKPWKNYKB-UHFFFAOYSA-N', 'IUPACName': '2-heptyl-2,3-dihydropyran-6-one'}]
[{'CID': 27866, 'CanonicalSMILES': 'CC1CCOC(C1)C=C(C)C', 'IsomericSMILES': 'CC1CCOC(C1)C=C(C)C', 'InChI': 'InChI=1S/C10H18O/c1-8(2)6-10-7-9(3)4-5-11-10/h6,9-10H,4-5,7H2,1-3H3', 'InChIKey': 'CZCBTSFUTPZVKJ-UHFFFAOYSA-N', 'IUPACName': '4-methyl-2-(2-methylprop-1-enyl)oxane'}]
[{'CID': 27867, 'CanonicalSMILES': 'CC1CCC(C(C1)OC(=O)C)C(C)C', 'IsomericSMILES': 'CC1CCC(C(C1)OC(=O)C)C(C)C', 'InChI': 'InChI=1S/C12H22O2/c1-8(2)11-6-5-9(3)7-12(11)14-10(4)13/h8-9,11-12H,5-7H2,1-4H3', 'InChIKey': 'XHXUANMFYXWVNG-UHFFFAOYSA-N', 'IUPACName': '(5-methyl-2-propan-2-ylcyclohexyl) acetate'}]
[{'CID': 565690, 'CanonicalSMILES': 'CC1CCC(C(C1)OC(=O)CC(C)C)C(C)C', 'IsomericSMILES': 'CC1CCC(C(C1)OC(=O)CC(C)C)C(C)C', 'InChI': 'InCh