<a href="https://colab.research.google.com/github/Swayamprakashpatel/DD/blob/main/CID_to_SMILE_%26_Fingerprint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PUBCHEM CID TO FINGERPRINT GENERATION**

In [None]:
# Install PubChemPy if not already installed
!pip install pubchempy

import pandas as pd
import pubchempy as pcp
from tqdm import tqdm
import time as tm

# File paths
input_file = "/content/P1-09-Target_compound_activity.csv"  # Update with your file path
output_file = "pubchem_smiles_fingerprints.csv"

# Load the PubChem IDs from CSV
df = pd.read_csv(input_file)

# Build the list of CIDs to query:
#   * coerce to numeric so blank or malformed cells become NaN and are dropped
#     instead of being sent to PubChem, where they would only produce a failed lookup;
#   * drop repeated CIDs (first occurrence kept, original order preserved) so each
#     compound is requested once -- every lookup is a network round trip;
#   * cast to int so an ID is never formatted as a float such as "17280.0".
# On a column that is already clean, unique integers this yields the same list
# as a plain .tolist().
cid_values = pd.to_numeric(df['Pubchem CID'], errors='coerce')
pubchem_ids = cid_values.dropna().drop_duplicates().astype('int64').tolist()

# Function to fetch SMILES and fingerprint
def fetch_smiles_and_fingerprint(pubchem_id):
    """Look up one compound by CID and return its SMILES and fingerprint.

    Args:
        pubchem_id: CID passed to ``pcp.Compound.from_cid``.

    Returns:
        A ``(canonical_smiles, cactvs_fingerprint)`` tuple read from the
        fetched compound, or ``(None, None)`` if any exception is raised
        during the lookup (the error is printed, not re-raised).
    """
    try:
        record = pcp.Compound.from_cid(pubchem_id)
        # SMILES is read first, then the fingerprint.
        return record.canonical_smiles, record.cactvs_fingerprint
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this CID.
        print(f"Error fetching data for PubChemID {pubchem_id}: {e}")
        return None, None

# One record (dict) per compound that was fetched successfully; turned into a
# DataFrame in a single step after the loop.
results = []

# Iterate over PubChem IDs and fetch data
for pubchem_id in tqdm(pubchem_ids, desc="Fetching SMILES and fingerprints"):
    smiles, fingerprint = fetch_smiles_and_fingerprint(pubchem_id)

    # Skip compounds whose lookup failed or came back without SMILES/fingerprint.
    if smiles and fingerprint:
        result = {"PubChemID": pubchem_id, "SMILES": smiles}

        # One column per fingerprint element, numbered from 1 (FP_1, FP_2, ...).
        result.update(
            (f"FP_{position}", bit)
            for position, bit in enumerate(fingerprint, start=1)
        )

        results.append(result)

    # Sleep for a while to avoid overwhelming the server
    tm.sleep(0.1)

# Convert results to a DataFrame and save it as CSV (without the index column).
output_df = pd.DataFrame(results)
output_df.to_csv(output_file, index=False)

print(f"Final data saved to {output_file}")

# Offer the file that was just written for download. Using output_file (rather
# than a second hard-coded path) keeps this in step with the save above and
# does not depend on the working directory being /content.
from google.colab import files
files.download(output_file)




Fetching SMILES and fingerprints:   0%|          | 445/803580 [01:45<55:22:37,  4.03it/s]

In [2]:
import pandas as pd
from pubchempy import get_compounds

# Location of the activity table that holds the 'Pubchem CID' column.
input_file = "/content/P1-09-Target_compound_activity.csv"  # Update with your file path

# Read the whole table, then keep only the first five CIDs as a plain list
# for a quick connectivity check.
df = pd.read_csv(input_file)
pubchem_ids = df['Pubchem CID'].iloc[:5].tolist()

# Function to fetch SMILES
def fetch_smiles(pubchem_id):
    """Return the canonical SMILES for a CID, or None if it cannot be fetched.

    Args:
        pubchem_id: Identifier passed to ``get_compounds`` in the 'cid' namespace.

    Returns:
        ``canonical_smiles`` of the first compound returned, or ``None`` when
        the result is empty or any exception is raised (the error is printed).
    """
    try:
        matches = get_compounds(pubchem_id, 'cid')
        # Nothing came back for this CID.
        if not matches:
            return None
        # Only the first match is used.
        return matches[0].canonical_smiles
    except Exception as e:
        print(f"Error fetching data for PubChemID {pubchem_id}: {e}")
        return None

# Smoke test: look up each selected CID and print what came back.
# fetch_smiles returns None when a lookup fails or finds nothing, so such a
# CID is printed as "SMILES: None" rather than stopping the loop.
for pubchem_id in pubchem_ids:
    smiles = fetch_smiles(pubchem_id)
    print(f"PubChem CID: {pubchem_id}, SMILES: {smiles}")


PubChem CID: 17280, SMILES: CCOP(=O)(OCC)OC1=CC=CC=C1
PubChem CID: 197934, SMILES: CCOP(=O)(OCC)OC1=CC=C(C=C1)N
PubChem CID: 639433, SMILES: C1=CC=C2C(=C1)N=CN2CN3C4=CC=CC=C4N=N3
PubChem CID: 907361, SMILES: CC1=C(C2=C(C=C1)C(=CC(=O)O2)CCl)C
PubChem CID: 940069, SMILES: CC1=CC(=C2C(=CC(=O)OC2=C1)CCl)C


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
!pip install pubchempy
import pubchempy as pcp
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from google.colab import files
import time as tm

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13820 sha256=6d6cb7896450d88ce03e93b7ef79312bc36a213760b3867acbd2792932f24d82
  Stored in directory: /root/.cache/pip/wheels/90/7c/45/18a0671e3c3316966ef7ed9ad2b3f3300a7e41d3421a44e799
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [None]:
# Example only: fetch a single compound by CID and show its CACTVS fingerprint.
# The bare expression on the last line is what the cell displays as its output.
c = pcp.Compound.from_cid(3033877)
c.cactvs_fingerprint

In [None]:
#FOR DRUGS

# Read the drug table; the 'CID' column supplies the PubChem IDs to look up.
df1 = pd.read_csv('/content/Drug.csv')

# Fetch the CACTVS fingerprint of every drug, one request per CID.
cidvals = df1['CID']
componentFingerprint = []
for cid in cidvals:
  c = pcp.Compound.from_cid(cid)
  componentFingerprint.append(c.cactvs_fingerprint)
  tm.sleep(0.1)  # brief pause between requests

# Attach the fingerprints as a string column 'fp' (cast once, not once per bit).
df2 = df1.assign(fp=componentFingerprint)
df2['fp'] = df2['fp'].astype(str)

# Take the number of bit columns from the fetched fingerprints themselves
# rather than from whichever compound happened to be fetched last; an empty
# table simply gets no bit columns.
fp_width = int(df2['fp'].str.len().max()) if len(df2) else 0

# Build fp1..fpN (one character of 'fp' each) in a single frame and attach it
# with one concat; inserting hundreds of columns one at a time fragments the
# DataFrame. Any same-named columns already present are replaced.
fp_bits = pd.DataFrame(
    {'fp' + str(i): df2['fp'].str[i-1:i] for i in range(1, fp_width + 1)},
    index=df2.index,
)
df2 = pd.concat(
    [df2.drop(columns=fp_bits.columns, errors='ignore'), fp_bits],
    axis=1,
)

# NOTE(review): when the working directory is /content this overwrites the
# input file read above, and the index is written as an extra column --
# confirm that is intended before re-running this cell.
df2.to_csv('Drug.csv')

In [None]:
#FOR EXCIPIENT

# Read the excipient table; the 'CID' column supplies the PubChem IDs to look up.
df1 = pd.read_excel('/content/Excipient.xlsx')

# Fetch the CACTVS fingerprint of every excipient, one request per CID.
cidvals = df1['CID']
componentFingerprint = []
for cid in cidvals:
  c = pcp.Compound.from_cid(cid)
  componentFingerprint.append(c.cactvs_fingerprint)
  tm.sleep(0.1)  # brief pause between requests

# Attach the fingerprints as a string column 'fp' (cast once, not once per bit).
df2 = df1.assign(fp=componentFingerprint)
df2['fp'] = df2['fp'].astype(str)

# Take the number of bit columns from the fetched fingerprints themselves
# rather than from whichever compound happened to be fetched last; an empty
# table simply gets no bit columns.
fp_width = int(df2['fp'].str.len().max()) if len(df2) else 0

# Build fp1..fpN (one character of 'fp' each) in a single frame and attach it
# with one concat; inserting hundreds of columns one at a time fragments the
# DataFrame. Any same-named columns already present are replaced.
fp_bits = pd.DataFrame(
    {'fp' + str(i): df2['fp'].str[i-1:i] for i in range(1, fp_width + 1)},
    index=df2.index,
)
df2 = pd.concat(
    [df2.drop(columns=fp_bits.columns, errors='ignore'), fp_bits],
    axis=1,
)

# The index is written as an extra first column (to_csv default).
df2.to_csv('FP_Excipient.csv')

**CID TO SMILES**

In [None]:
import pandas as pd
import pubchempy as pcp
import time as tm

# Load the GNN.csv file
df = pd.read_csv('/content/GNN.csv')


def _cids_to_smiles(cids, cache, delay=0.1):
    """Map each CID in ``cids`` to its canonical SMILES, preserving order.

    Args:
        cids: Iterable of PubChem CIDs.
        cache: Dict of CID -> SMILES for lookups that already succeeded; it is
            read and updated in place so a repeated CID is not requested again.
        delay: Seconds to pause after each request, to avoid hitting API limits.

    Returns:
        A list with one entry per input CID: the SMILES value, or ``None``
        when the lookup raised an exception (the error is printed).
    """
    smiles_list = []
    for cid in cids:
        if cid in cache:
            smiles_list.append(cache[cid])
            continue
        try:
            smiles = pcp.Compound.from_cid(cid).canonical_smiles
        except Exception as e:
            # Catch Exception, not a bare except, so Ctrl-C still stops the run.
            # Failures are not cached: the CID is tried again if it reappears.
            print(f"Error fetching SMILES for CID {cid}: {e}")
            smiles = None
        else:
            cache[cid] = smiles
        smiles_list.append(smiles)
        tm.sleep(delay)
    return smiles_list


# One cache shared by both columns, so a compound that appears many times
# (or in both columns) costs a single request.
smiles_by_cid = {}

# Convert drug and solvent CIDs to SMILES (None where conversion fails)
drug_smiles = _cids_to_smiles(df['drug_cid'], smiles_by_cid)
solvent_smiles = _cids_to_smiles(df['solvent_cid'], smiles_by_cid)

# Create a new DataFrame with the required columns
df_result = pd.DataFrame({
    'Drug_Smile': drug_smiles,
    'Solvent_Smile': solvent_smiles,
    'Solubility': df['solubility']
})

# Save the result to a new CSV file
df_result.to_csv('GNN_Smiles.csv', index=False)

# Display the final DataFrame
print(df_result)


                                     Drug_Smile  \
0     C1C(OC2=CC(=CC(=C2C1=O)O)O)C3=CC=C(C=C3)O   
1     C1C(OC2=CC(=CC(=C2C1=O)O)O)C3=CC=C(C=C3)O   
2     C1C(OC2=CC(=CC(=C2C1=O)O)O)C3=CC=C(C=C3)O   
3     C1C(OC2=CC(=CC(=C2C1=O)O)O)C3=CC=C(C=C3)O   
4     C1C(OC2=CC(=CC(=C2C1=O)O)O)C3=CC=C(C=C3)O   
...                                         ...   
1374  C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl   
1375  C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl   
1376  C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl   
1377  C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl   
1378  C1NC2=CC(=C(C=C2S(=O)(=O)N1)S(=O)(=O)N)Cl   

                                          Solvent_Smile  Solubility  
0     C1=CC=C(C=C1)CC(C(=O)NC(CCCCN)C(=O)N)NC(=O)C(C...        4.86  
1                       CCCCCCCCC=CCCCCCCCC(=O)OCC(CO)O       15.57  
2                      CCCCCC=CCC=CCCCCCCCC(=O)OCC(CO)O       27.58  
3                                         CCCCC(CCCO)Cl       17.84  
4                                  CC