In [None]:
# Setup and imports
# The standard-library modules come first so the project source directory can
# be added to the import path before the Quantifyr modules are loaded below.
import sys
import os

# Make the project's modules importable. The path is relative, so it is
# resolved against the kernel's current working directory.
sys.path.append("../src")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from rdkit import Chem
from rdkit.Chem import Draw
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by third-party
# libraries — consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")

# Import our Quantifyr modules
# NOTE(review): presumably resolved through the "../src" entry appended above —
# confirm the notebook is launched from a directory where that path is valid.
from data_utils import (
    parse_smiles,
    extract_atom_features,
    extract_bond_features,
    mol_to_graph,
    compute_molecular_descriptors,
    load_molecule_dataset,
    create_molecular_dataframe,
    MoleculeConfig,
)

from viz import (
    draw_molecule_2d,
    plot_molecular_properties,
    plot_property_distribution,
    plot_molecular_network,
    create_3d_conformer_plot,
    create_molecular_dashboard,
)

# Reaching this point means every import above succeeded.
print("✅ Successfully imported Quantifyr modules!")
print("🧪 Ready for molecular data processing experiments")

In [None]:
# Sample molecules keyed by a human-readable name. The final entry is a
# deliberately malformed SMILES string used to exercise the failure path.
sample_molecules = {
    "ethanol": "CCO",
    "benzene": "c1ccccc1",
    "caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
    "aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
    "water": "O",
    "methane": "C",
    "glucose": "C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O",
    "dopamine": "NCCc1ccc(O)c(O)c1",
    "invalid": "this_is_not_a_smiles",  # Test invalid SMILES
}

print("🧪 Testing SMILES parsing with diverse molecules:\n")

# Parse each entry and keep the result (including None for failures) so that
# later cells can look molecules up by name.
parsed_molecules = {}
for name, smiles in sample_molecules.items():
    mol = parse_smiles(smiles)
    parsed_molecules[name] = mol

    # Both report lines share the same aligned "name | smiles" prefix.
    label = f"{name:12} | {smiles:20}"
    if mol is None:
        print(f"❌ {label} → FAILED TO PARSE")
        continue

    atom_count = mol.GetNumAtoms()
    bond_count = mol.GetNumBonds()
    print(f"✅ {label} → {atom_count:2d} atoms, {bond_count:2d} bonds")

# Summary: how many of the inputs produced a molecule object.
parsed_count = sum(m is not None for m in parsed_molecules.values())
print(f"\n📊 Successfully parsed: {parsed_count}/{len(sample_molecules)} molecules")

In [None]:
# Show atom- and bond-level feature arrays for a handful of parsed molecules.
test_molecules = ["ethanol", "benzene", "caffeine"]

print("🔬 Atomic Feature Extraction:\n")
print("Features: [Atomic#, Degree, Charge, Hybridization, Aromatic, H_count]")
print("-" * 70)

for name in test_molecules:
    mol = parsed_molecules[name]
    if mol is None:
        # Nothing to report for a molecule that failed to parse.
        continue

    atom_features = extract_atom_features(mol)
    bond_features = extract_bond_features(mol)

    print(f"\n{name.upper()}:")
    print(f"  Atomic features shape: {atom_features.shape}")
    print(f"  Bond features shape:   {bond_features.shape}")

    # Preview at most the first three atoms, numbered from 1 for display.
    print("  First 3 atoms: ")
    preview_count = min(3, len(atom_features))
    for position in range(1, preview_count + 1):
        print(f"    Atom {position}: {atom_features[position - 1]}")

print(f"\n{'=' * 70}")

In [None]:
# Build a descriptor table: one record per successfully parsed molecule,
# tagged with its name and the SMILES string it was parsed from.
print("📊 Molecular Descriptors for Property Prediction:\n")

descriptor_data = []
for name, mol in parsed_molecules.items():
    if mol is None:
        continue  # unparsed entries contribute no row

    descriptors = compute_molecular_descriptors(mol)
    descriptors["name"] = name
    descriptors["smiles"] = sample_molecules[name]
    descriptor_data.append(descriptors)

# Assemble all records into a DataFrame in a single step.
df_descriptors = pd.DataFrame(descriptor_data)

print(f"Computed descriptors for {len(df_descriptors)} molecules")
print("\nDescriptor columns:", list(df_descriptors.columns))

# Preview a handful of descriptor columns alongside the molecule name.
print("\nSample data:")
preview_columns = ["name", "molecular_weight", "logp", "tpsa", "num_rings"]
print(df_descriptors[preview_columns].head())

In [None]:
# Convert a few parsed molecules into graph objects and report the size of
# their node-feature, edge-index and (when present) edge-feature arrays.
print("🕸️  Converting Molecules to Graph Objects:\n")

graph_data = []
for name in ["ethanol", "benzene", "caffeine"]:
    mol = parsed_molecules[name]
    if mol is None:
        continue  # skip molecules that failed to parse

    graph = mol_to_graph(mol, include_edge_features=True)
    graph_data.append(graph)

    # An attribute can exist and still be None, so `hasattr` alone does not
    # guarantee that `.shape` is available. Check the value itself.
    # NOTE(review): presumably `graph` is a PyTorch Geometric `Data` object,
    # which reports unset attributes as None — confirm against mol_to_graph.
    edge_attr = getattr(graph, "edge_attr", None)

    print(f"{name.upper()}:")
    print(f"  Node features (x):     {graph.x.shape}")
    print(f"  Edge indices:          {graph.edge_index.shape}")
    if edge_attr is not None:
        print(f"  Edge features:         {edge_attr.shape}")
    print(f"  Nodes: {graph.x.shape[0]}, Edges: {graph.edge_index.shape[1]}")
    print()

print(f"✅ Successfully created {len(graph_data)} graph objects for GNN training!")

In [None]:
# Create 2D molecular drawings: a 2x2 grid, one panel per molecule.
# (Import kept so this cell's dependencies are unchanged.)
import matplotlib.pyplot as plt

molecules_to_draw = ["ethanol", "benzene", "caffeine", "aspirin"]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

print("🎨 Creating 2D molecular structure drawings...\n")

for ax, name in zip(axes, molecules_to_draw):
    mol = parsed_molecules[name]
    if mol is None:
        # Leave a clean blank panel instead of an empty frame with ticks.
        ax.axis("off")
        continue

    # Render the molecule to an image and show it in this panel.
    img = draw_molecule_2d(mol, size=(400, 400))
    ax.imshow(img)
    ax.set_title(f"{name.title()}\n{sample_molecules[name]}", fontsize=12, pad=20)
    ax.axis("off")

    print(f"✅ Drew {name} ({mol.GetNumAtoms()} atoms)")

# Add the figure-level title BEFORE the layout pass so that tight_layout can
# reserve room for it rather than letting it collide with the panel titles.
fig.suptitle("2D Molecular Structures", fontsize=16, y=0.98)
fig.tight_layout()
plt.show()

print("\n🎉 Beautiful 2D molecular visualizations complete!")

In [None]:
# Interactive 3D view of a single molecule (caffeine).
print("🌐 Creating Interactive 3D Molecular Visualizations...\n")

caffeine_mol = parsed_molecules["caffeine"]
if caffeine_mol is None:
    # Parsing failed earlier, so there is nothing to render.
    print("❌ Could not create 3D visualization - caffeine molecule not available")
else:
    print("☕ Creating 3D visualization of Caffeine...")

    # Build the figure, then apply the title and canvas size before showing it.
    fig_3d = create_3d_conformer_plot(caffeine_mol)
    caffeine_layout = dict(
        title="Interactive 3D Structure of Caffeine (C₈H₁₀N₄O₂)",
        width=800,
        height=600,
    )
    fig_3d.update_layout(**caffeine_layout)
    fig_3d.show()

    print("✅ 3D caffeine visualization complete!")
    print("💡 You can rotate, zoom, and pan the 3D structure above!")

In [None]:
# Property analysis over a larger, hand-picked set of SMILES strings.
print("📊 Analyzing Molecular Properties...\n")

# Input set for the analysis, grouped by chemical family.
extended_smiles = [
    # Ethanol and short alkanes
    "CCO",
    "CC",
    "CCC",
    "CCCC",
    "CCCCC",
    # Aromatics
    "c1ccccc1",
    "c1ccc(C)cc1",
    "c1ccc(O)cc1",
    # Named compounds
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
    "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
    "NCCc1ccc(O)c(O)c1",  # Dopamine
    "CC(C)C(=O)O",  # Isobutyric acid
    "CCCCCCCCCCCCCCCC(=O)O",  # Palmitic acid
]

# Turn the SMILES list into a per-molecule DataFrame.
print("🔬 Processing extended molecule dataset...")
df_analysis = create_molecular_dataframe(extended_smiles)

total_rows = len(df_analysis)
valid_rows = df_analysis["valid"].sum()
print(f"✅ Processed {total_rows} molecules")
print(f"✅ {valid_rows} valid molecules for analysis")

# Summary statistics for the numeric property columns, rounded for display.
print("\n📈 Basic Property Statistics:")
numeric_cols = ["molecular_weight", "logp", "tpsa", "num_rotatable_bonds", "num_rings"]
property_summary = df_analysis[numeric_cols].describe().round(2)
print(property_summary)

In [None]:
# Pairwise relationships between selected properties as an interactive figure.
print("🎨 Creating Interactive Property Scatter Matrix...")

properties_to_plot = ["molecular_weight", "logp", "tpsa", "num_rings"]
fig_scatter = plot_molecular_properties(df_analysis, properties_to_plot)

# Title and canvas size are applied after the figure has been built.
scatter_layout = dict(
    title="Interactive Molecular Property Relationships",
    height=700,
    width=900,
)
fig_scatter.update_layout(**scatter_layout)
fig_scatter.show()

print("✅ Interactive scatter matrix complete!")
print("💡 Hover over points to see SMILES strings!")

In [None]:
# Distribution of molecular weight, followed by a look at both extremes.
print("📊 Analyzing Molecular Weight Distribution...")

fig_dist = plot_property_distribution(df_analysis, "molecular_weight")
distribution_layout = dict(
    title="Molecular Weight Distribution Analysis",
    height=600,
    width=800,
)
fig_dist.update_layout(**distribution_layout)
fig_dist.show()

print("✅ Distribution analysis complete!")

# Split off the rows above 200 and below 50 on the molecular_weight column.
print("\n🔍 Interesting Insights:")
weights = df_analysis["molecular_weight"]
heavy_molecules = df_analysis[weights > 200]
light_molecules = df_analysis[weights < 50]

# Report each group's size, listing its SMILES only when it is non-empty.
print(f"• Heaviest molecules (>200 Da): {len(heavy_molecules)}")
if len(heavy_molecules):
    print(f"  - Examples: {heavy_molecules['smiles'].tolist()}")

print(f"• Lightest molecules (<50 Da): {len(light_molecules)}")
if len(light_molecules):
    print(f"  - Examples: {light_molecules['smiles'].tolist()}")

In [None]:
# Render a few molecules as node/edge network diagrams.
print("🕸️  Creating Molecular Network Visualizations...")

# SMILES strings to load as graphs for the network view.
network_smiles = [
    "CCO",
    "c1ccccc1",
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
    "CC(=O)OC1=CC=CC=C1C(=O)O",
]

# The loader returns a pair; only the first element (the graphs) is used here.
network_graphs, _ = load_molecule_dataset(network_smiles)

print(f"✅ Created {len(network_graphs)} molecular graphs")

# Plot the graphs, then apply the title and canvas size.
fig_network = plot_molecular_network(network_graphs, max_molecules=4)
network_layout = dict(
    title="Molecular Graph Networks - Atoms as Nodes, Bonds as Edges",
    height=800,
    width=1200,
)
fig_network.update_layout(**network_layout)
fig_network.show()

print("✅ Molecular network visualization complete!")
print("💡 Each node represents an atom, colored by atomic number")
print("💡 Each edge represents a chemical bond")

In [None]:
# Build the combined dashboard and display each of its figures in turn.
print("🚀 Creating Comprehensive Molecular Dashboard...\n")

# Only successfully parsed molecules are handed to the dashboard.
# NOTE(review): df_analysis was built from extended_smiles, while these
# molecules come from sample_molecules — the two sets differ. Confirm that
# create_molecular_dashboard does not expect them to line up.
dashboard_molecules = [mol for mol in parsed_molecules.values() if mol is not None]

dashboard = create_molecular_dashboard(df_analysis, dashboard_molecules)

# List the component names before rendering anything.
print(f"✅ Dashboard created with {len(dashboard)} visualizations:")
for name in dashboard:
    print(f"  📊 {name}")

print("\n🎨 Displaying dashboard components...")

# Render every component at a uniform size under a readable heading.
for name, fig in dashboard.items():
    heading = name.upper().replace("_", " ")
    print(f"\n--- {heading} ---")
    fig.update_layout(width=900, height=500)
    fig.show()

print("\n🎉 Complete molecular dashboard demonstration finished!")
print("💡 This shows the power of our modular Quantifyr implementation!")