#### Generate SMILES using APIs: 
CACTUS: https://cactus.nci.nih.gov/chemical/structure <br>
PubChemPy: https://pubchem.ncbi.nlm.nih.gov

Import relevant libraries

In [None]:
import requests
import pandas as pd
import time
import pubchempy as pcp
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [None]:
def cas_to_smiles(cas_number):
    """Resolve a CAS number to a SMILES string through the CACTUS web service.

    Args:
        cas_number: The CAS registry number to look up. It is converted to
            text, stripped of surrounding whitespace and percent-encoded
            before being placed in the request URL.

    Returns:
        A ``(cas_number, smiles)`` tuple. ``cas_number`` is returned exactly
        as passed in so callers can use it as a lookup key; ``smiles`` is
        ``None`` when the request fails (network error, timeout, HTTP error
        status) or the response body is empty.
    """
    from urllib.parse import quote

    # Encode the identifier so characters such as spaces, '/', '?' or '#'
    # cannot change the path of the request.
    identifier = quote(str(cas_number).strip(), safe="")
    url = f"https://cactus.nci.nih.gov/chemical/structure/{identifier}/smiles"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Best-effort lookup: a failed request is reported as "no SMILES".
        return cas_number, None

    smiles = response.text.strip()
    return cas_number, smiles or None

if __name__ == "__main__":
    # Read the table of CAS numbers that do not have SMILES yet.
    df = pd.read_csv("PreprocessData/rawData/SoSCAS_withoutSmiles.csv")

    if "cas_number" not in df.columns:
        raise ValueError("Column 'cas_number' not found in the CSV file.")

    # Rows with no CAS number cannot be looked up, so drop them first.
    df = df.dropna(subset=["cas_number"])

    # Each distinct CAS number is requested only once.
    unique_cas_numbers = df["cas_number"].unique()
    print("CAS numbers to process:", len(unique_cas_numbers))

    # Run the lookups on a pool of 10 worker threads.
    print("Fetching SMILES using multithreading...")
    cas_to_smiles_dict = {}

    with ThreadPoolExecutor(max_workers=10) as executor:
        pending = [executor.submit(cas_to_smiles, cas) for cas in unique_cas_numbers]
        for finished in tqdm(as_completed(pending), total=len(pending)):
            found_cas, found_smiles = finished.result()
            if not found_smiles:
                # Lookups that returned nothing are left out of the mapping.
                continue
            cas_to_smiles_dict[found_cas] = found_smiles

    # Apply the mapping to every row, so repeated CAS numbers share a result.
    df["SMILES"] = df["cas_number"].map(cas_to_smiles_dict)

    # Put the CAS and SMILES columns first, keeping the others in order.
    first_cols = ["cas_number", "SMILES"]
    other_cols = [name for name in df.columns if name not in first_cols]
    df = df[first_cols + other_cols]

    # Write the result without the index column.
    df.to_csv("SoS_SMILES.csv", index=False)


CAS numbers to process: 3839
Fetching SMILES using multithreading...


100%|██████████| 3839/3839 [02:22<00:00, 26.95it/s]


Because SMILES generation can be interrupted by API rate limits, we process the data in rounds. We first generate SMILES for the available CAS numbers and split the output into two parts: `Odor_SMILES_part1` (rows whose SMILES were generated successfully) and `Odor_SMILES_part2` (rows whose SMILES are still missing). We then run `Odor_SMILES_part2` through the same process, producing a new file that we split again. The newly generated SMILES (`Part3`) are concatenated with `Odor_SMILES_part1`, and the cycle continues until the API stops returning results entirely.

#### **Split** CSV

In [None]:
# Load the CSV, reading CAS numbers as text
df = pd.read_csv("PreprocessData/rawData/SoS_SMILES.csv", dtype={'cas_number': str})

# Split at the first row whose SMILES is still missing, i.e. the position up
# to which SMILES have been fetched. If no SMILES is missing, the split falls
# after the last row. Assign split_row by hand here to use a different cut.
missing_smiles = df["SMILES"].isna().to_numpy()
split_row = int(missing_smiles.argmax()) if missing_smiles.any() else len(df)

# Part 1 holds the rows before the split. Part 2 holds row 0 followed by the
# rows from the split onward, so row 0 appears in both parts.
df_part1 = df.iloc[:split_row]
df_part2 = pd.concat([df.iloc[[0]], df.iloc[split_row:]], ignore_index=True)

# Save the parts
df_part1.to_csv("Odor_SMILES_part1.csv", index=False)
df_part2.to_csv("Odor_SMILES_part2.csv", index=False)

print(f"CSV file split at row {split_row}, keeping the first row in both parts.")


This is example code showing how the SMILES generation process is carried out. The final CSV file is available as SoS_SMILES.csv.

#### Concatenate two CSV files

In [None]:
# Load both CSV files (the two paths below are placeholders to fill in)
df1 = pd.read_csv("Inital CSV with smiles")
df2 = pd.read_csv("newly generated csv from Odor_SMILES_part2.csv")

# Stack the rows of the second file under the first and renumber the index
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save to a new file; the same name is used in the message below so the
# reported file is always the one that was written
merged_file = "Odor_SMILES_merged12.csv"
merged_df.to_csv(merged_file, index=False)

print(f"Files merged successfully into '{merged_file}'")


More than 2000 SMILES were successfully retrieved from CAS numbers using the CACTUS API. For the remaining CAS values where CACTUS did not return results, we used the PubChemPy API to complete the SMILES extraction. This two-step approach ensured maximum coverage.

### Fetch SMILES through PubChemPy 

In [None]:
# Input table of CAS numbers and the file the enriched table is written to
input_file = "PreprocessData/rawData/SoSCAS_withoutSmiles.csv"
output_file = "Odor_with_Smiles.csv"
df = pd.read_csv(input_file)

# Use the "cas_number" column when it exists; otherwise fall back to the
# first column of the table
if "cas_number" in df.columns:
    cas_column = "cas_number"
else:
    cas_column = df.columns[0]

# Cache of CAS value -> SMILES (or None), so each value is looked up once
smiles_cache = {}

# Lower-cased strings that stand for "no CAS number". "nan", "none" and "<na>"
# are what missing cells turn into when a pandas column is cast to str.
_MISSING_CAS = frozenset({"", "0", "nan", "none", "<na>"})


# Function to fetch SMILES
def fetch_smiles(cas):
    """Look up the SMILES string for a CAS number through PubChemPy.

    Results, including failed lookups, are stored in ``smiles_cache`` under
    the value passed in, so a repeated value does not trigger a new request.

    Args:
        cas: The CAS number to look up. Surrounding whitespace is ignored.
            ``None``, empty or whitespace-only text, ``"0"`` and the string
            forms of missing values (``"nan"``, ``"None"``, ``"<NA>"``) are
            treated as "no CAS number" and are not sent to PubChem.

    Returns:
        The SMILES string of the first matching compound, or ``None`` when
        the value is missing, no compound is found, or the lookup raises.
    """
    if cas in smiles_cache:
        return smiles_cache[cas]

    key = "" if cas is None else str(cas).strip()
    if key.lower() in _MISSING_CAS:
        smiles_cache[cas] = None
        return None

    try:
        # The CAS number is searched as a compound name.
        cids = pcp.get_cids(key, namespace='name')
        # Short pause between the two requests made for each CAS number.
        time.sleep(0.2)
        if not cids:
            print(f"CAS not found: {cas}")
            smiles_cache[cas] = None
            return None

        # Only the first matching compound is used.
        props = pcp.get_properties('SMILES', cids[:1], as_dataframe=False)
        smiles = props[0].get('SMILES') if props else None
        smiles_cache[cas] = smiles
        return smiles
    except Exception as e:
        # Best-effort lookup: report the failure and continue with the next
        # value instead of aborting the whole run.
        print(f"Error fetching {cas}: {e}")
        smiles_cache[cas] = None
        return None

# Register tqdm's progress_apply on pandas objects
tqdm.pandas(desc="Fetching SMILES")

# Cast the CAS column to text and look up a SMILES for every row
cas_as_text = df[cas_column].astype(str)
df['SMILES'] = cas_as_text.progress_apply(fetch_smiles)

# Move the CAS and SMILES columns to the front, keeping the rest in order
leading_cols = [cas_column, 'SMILES']
trailing_cols = [name for name in df.columns if name not in leading_cols]
df = df[leading_cols + trailing_cols]

# Write the table without the index column
df.to_csv(output_file, index=False)
print(f"Output saved to {output_file}")
