# Drug Graph & ECFP4 Fingerprint Generator

This notebook parses SMILES strings, generates ECFP4 (Morgan) fingerprints, and produces molecular graph data suitable for machine learning and GNN workflows.

In [2]:
# Install RDKit if needed
!pip install rdkit-pypi
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import networkx as nx
import json
import os



In [4]:
# Example SMILES input: a list of (compound name, SMILES string) pairs.
data = [
    ("Ivacaftor", "CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=O)O)C(C)(C)C")

]

# Fingerprint settings. ECFP4 corresponds to a Morgan fingerprint of radius 2.
# FP_NBITS drives both the bit-vector length and the CSV column count, so the
# two can no longer drift apart.
FP_RADIUS = 2
FP_NBITS = 1024

# Output locations (relative to the current working directory).
OUTPUT_DIR = "output"
GRAPH_DIR = f"{OUTPUT_DIR}/molecular_graphs"
os.makedirs(GRAPH_DIR, exist_ok=True)


def mol_to_graph(mol):
    """Build an undirected NetworkX graph from an RDKit molecule.

    Nodes are keyed by atom index and carry ``atom_num``, ``aromatic`` and
    ``hybridization`` (as a string) attributes. Edges connect the begin/end
    atom indices of each bond and carry ``bond_type`` (as a string).
    """
    graph = nx.Graph()
    for atom in mol.GetAtoms():
        graph.add_node(atom.GetIdx(),
                       atom_num=atom.GetAtomicNum(),
                       aromatic=atom.GetIsAromatic(),
                       hybridization=str(atom.GetHybridization()))
    for bond in mol.GetBonds():
        graph.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(),
                       bond_type=str(bond.GetBondType()))
    return graph


fingerprints = []
skipped = []  # (name, smiles) pairs RDKit could not parse
for name, smiles in data:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None instead of
        # raising; without this guard the fingerprint call below would fail
        # with an opaque argument-type error.
        skipped.append((name, smiles))
        print(f"Warning: could not parse SMILES for {name!r}: {smiles}")
        continue

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, FP_RADIUS, nBits=FP_NBITS)
    fp_array = np.array(fp)
    fingerprints.append([name, smiles] + fp_array.tolist())

    # Graph creation: one node-link JSON file per molecule, named after it.
    G = mol_to_graph(mol)
    with open(f"{GRAPH_DIR}/{name}.json", "w") as f:
        json.dump(nx.node_link_data(G), f)

# Save fingerprints: one row per successfully parsed molecule.
cols = ["Name", "SMILES"] + [f"Bit_{i}" for i in range(FP_NBITS)]
fp_df = pd.DataFrame(fingerprints, columns=cols)
fp_df.to_csv(f"{OUTPUT_DIR}/fingerprints.csv", index=False)
if skipped:
    print(f"Skipped {len(skipped)} molecule(s) with invalid SMILES.")
print("Done. Files saved in /output")

Done. Files saved in /output


In [6]:
# Download the generated files to the local machine (Google Colab only).
download_paths = [
    'output/fingerprints.csv',                 # fingerprint table
    'output/molecular_graphs/Ivacaftor.json',  # molecular graph for Ivacaftor
]

# `google.colab` exists only inside a Colab runtime. Guard the import so that
# "Restart & Run All" still completes in a local Jupyter session, where the
# files are already on disk and no browser download is needed.
try:
    from google.colab import files
except ImportError:
    files = None
    print("google.colab is not available; files were left on disk:")
    for path in download_paths:
        print(f"  {path}")
else:
    for path in download_paths:
        files.download(path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>