In [None]:
# Setup and imports
import sys
import os

# Add the project root to the Python path.
# The notebook is in notebooks/exploratory, so the project root is two levels up.
# `__file__` is not defined inside a notebook kernel, so the location is resolved
# from the kernel's working directory. The earlier expression,
# os.path.dirname("__file__"), operated on a quoted string literal and therefore
# always evaluated to "" - i.e. it already resolved against the working directory,
# just less obviously. The resulting path is unchanged.
# NOTE(review): assumes the kernel was started in the notebook's own directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Only register the path once so that re-running this cell does not keep
# growing sys.path with duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from rdkit import Chem
from rdkit.Chem import Draw
import warnings

# Installs a global "ignore" filter with no category or module restriction, so
# warnings of every category raised after this point are suppressed.
# NOTE(review): this also hides deprecation notices from the libraries used
# below - consider narrowing the filter if a specific warning is the target.
warnings.filterwarnings("ignore")

# Import our Quantifyr modules
from src.data_utils.data_utils import (
    parse_smiles,
    extract_atom_features,
    extract_bond_features,
    mol_to_graph,
    compute_molecular_descriptors,
    load_molecule_dataset,
    create_molecular_dataframe,
    MoleculeConfig,
)

from src.viz.viz import (
    # Theme control functions (NEW!)
    set_theme,
    get_current_theme,
    # Enhanced visualization functions
    draw_molecule_2d,  # Now interactive plots with theme support!
    plot_molecular_properties,
    plot_property_distribution,
    plot_molecular_network,  # Now supports meaningful titles
    create_3d_conformer_plot,  # Now with perfect zoom and smart titles
    # Note: create_molecular_dashboard removed for better flexibility
    # Use individual plotting functions for full control
)

# Reaching this line means every import above resolved; announce it.
print(
    "✅ Successfully imported Quantifyr modules!",
    "🧪 Ready for molecular data processing experiments",
    sep="\n",
)

In [None]:
# Theme control walkthrough: report the active theme, then select the dark one.
print(
    "🌓 Theme Control System:",
    "Choose your visualization theme preference!\n",
    sep="\n",
)

# Read the theme that is active before anything is changed
current_theme = get_current_theme()
print(f"Current theme: {current_theme}")

# Activate the dark theme (the notebook describes it as the default)
set_theme("dark")
print(
    "\n✅ Set to DARK theme - midnight background for coding sessions",
    "   - Background: #0d1117 (GitHub dark)",
    "   - Accent: #00acc1 (cyan)",
    "   - Text: #c9d1d9 (light gray)",
    sep="\n",
)

# The light theme is only described here, not applied. To apply it, run:
#   set_theme('light')
print(
    "\n💡 To switch to LIGHT theme for presentations, run:",
    "   set_theme('light')",
    "   - Background: #ffffff (white)",
    "   - Accent: #0969da (blue)",
    "   - Text: #24292f (dark gray)",
    sep="\n",
)

print(
    "\n🎨 All subsequent plots will automatically use your chosen theme!",
    "   Smart text contrast ensures element symbols are readable on all atom colors",
    sep="\n",
)

In [None]:
# A varied panel of molecules used throughout the notebook. The last entry is
# deliberately not a SMILES string so the failure path is exercised as well.
sample_molecules = dict(
    ethanol="CCO",
    benzene="c1ccccc1",
    caffeine="CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
    aspirin="CC(=O)OC1=CC=CC=C1C(=O)O",
    water="O",
    methane="C",
    glucose="C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O",
    dopamine="NCCc1ccc(O)c(O)c1",
    invalid="this_is_not_a_smiles",  # Test invalid SMILES
)

print("🧪 Testing SMILES parsing with diverse molecules:\n")

# Every entry is stored, including failures (as None), so later cells can
# look a molecule up by name and check for None themselves.
parsed_molecules = {}
for label, smi in sample_molecules.items():
    parsed = parse_smiles(smi)
    parsed_molecules[label] = parsed

    if parsed is None:
        print(f"❌ {label:12} | {smi:20} → FAILED TO PARSE")
        continue

    print(
        f"✅ {label:12} | {smi:20} → {parsed.GetNumAtoms():2d} atoms, {parsed.GetNumBonds():2d} bonds"
    )

# Count the entries that parsed successfully
n_parsed = sum(entry is not None for entry in parsed_molecules.values())
print(f"\n📊 Successfully parsed: {n_parsed}/{len(sample_molecules)} molecules")

In [None]:
# Report atom- and bond-level features for a few molecules, labelling each atom
# with its element symbol.
from src.data_utils.data_utils import get_element_symbol

test_molecules = ["ethanol", "benzene", "caffeine"]

print(
    "🔬 Atomic Feature Extraction with Element Symbols:\n",
    "Features: [Atomic#, Degree, Charge, Hybridization, Aromatic, H_count]",
    "-" * 70,
    sep="\n",
)

for label in test_molecules:
    molecule = parsed_molecules[label]
    if molecule is None:
        # Nothing to report for molecules that failed to parse
        continue

    atom_features = extract_atom_features(molecule)
    bond_features = extract_bond_features(molecule)

    print(f"\n{label.upper()}:")
    print(f"  Atomic features shape: {atom_features.shape}")
    print(f"  Bond features shape:   {bond_features.shape}")

    # Preview at most three atoms; index 0 of a feature row is the atomic number
    print("  First 3 atoms with element symbols:")
    preview_count = min(3, len(atom_features))
    for position in range(1, preview_count + 1):
        row = atom_features[position - 1]
        symbol = get_element_symbol(int(row[0]))
        print(f"    Atom {position:2d} ({symbol:>2s}): {row}")

    if not len(atom_features):
        continue

    # Spell out each feature of the first atom, in the column order given in
    # the header line printed above.
    lead = atom_features[0]
    atomic_number = int(lead[0])
    element = get_element_symbol(atomic_number)

    # NOTE(review): the raw integer code is printed after "SP"; whether that
    # code equals the sp-exponent depends on how extract_atom_features encodes
    # hybridization - TODO confirm against the feature extractor.
    hybrid_code = lead[3]
    hybrid_label = int(hybrid_code) if hybrid_code <= 3 else "?"
    aromatic_label = "Yes" if lead[4] else "No"

    print(f"  🧪 Feature explanation for first atom ({element}):")
    print(f"     - Element: {element} (atomic# {atomic_number})")
    print(f"     - Degree: {int(lead[1])} bonds")
    print(f"     - Charge: {int(lead[2])}")
    print(f"     - Hybridization: SP{hybrid_label}")
    print(f"     - Aromatic: {aromatic_label}")
    print(f"     - H atoms: {int(lead[5])}")

print("\n" + "=" * 70)
print("✨ Now you can see actual element symbols (C, N, O) instead of just numbers!")

In [None]:
# Build one descriptor record per successfully parsed molecule and tabulate them
print("📊 Molecular Descriptors for Property Prediction:\n")

descriptor_data = []
for label, molecule in parsed_molecules.items():
    if molecule is None:
        # Skip entries whose SMILES could not be parsed
        continue

    record = compute_molecular_descriptors(molecule)
    # Tag the record so each row can be traced back to its molecule
    record["name"] = label
    record["smiles"] = sample_molecules[label]
    descriptor_data.append(record)

# Assemble all records into a single table
df_descriptors = pd.DataFrame(descriptor_data)

preview_columns = ["name", "molecular_weight", "logp", "tpsa", "num_rings"]
print(f"Computed descriptors for {len(df_descriptors)} molecules")
print("\nDescriptor columns:", list(df_descriptors.columns))
print("\nSample data:")
print(df_descriptors[preview_columns].head())

In [None]:
# Convert molecules to graph representations and report their tensor shapes
print("🕸️  Converting Molecules to Graph Objects:\n")

graph_data = []
for name in ["ethanol", "benzene", "caffeine"]:
    mol = parsed_molecules[name]
    if mol is None:
        # Molecules that failed to parse have no graph representation
        continue

    graph = mol_to_graph(mol, include_edge_features=True)
    graph_data.append(graph)

    print(f"{name.upper()}:")
    print(f"  Node features (x):     {graph.x.shape}")
    print(f"  Edge indices:          {graph.edge_index.shape}")

    # hasattr() alone is not a sufficient guard: an attribute that exists but
    # holds None would pass it and then fail on `.shape`. Fetch the value and
    # test it instead.
    # NOTE(review): presumably `graph` is a PyTorch Geometric Data object, where
    # an unset edge_attr reads as None - confirm against mol_to_graph.
    edge_attr = getattr(graph, "edge_attr", None)
    if edge_attr is not None:
        print(f"  Edge features:         {edge_attr.shape}")

    # x has one row per node; edge_index has one column per edge entry
    print(f"  Nodes: {graph.x.shape[0]}, Edges: {graph.edge_index.shape[1]}")
    print()

print(f"✅ Successfully created {len(graph_data)} graph objects for GNN training!")

In [None]:
# Render an interactive 2D figure for each selected molecule
molecules_to_draw = ["ethanol", "benzene", "caffeine", "aspirin"]

print("🎨 Creating Interactive 2D Molecular Plots with Theme Support...\n")

for label in molecules_to_draw:
    molecule = parsed_molecules[label]
    if molecule is None:
        # Unparsed molecules cannot be drawn
        continue

    display_name = label.title()
    print(f"✨ Creating interactive plot for {display_name}...")

    # draw_molecule_2d returns a figure object that is customised below
    fig = draw_molecule_2d(molecule, size=(450, 350))

    # Title: name and SMILES on the first line, atom/bond counts in a subscript line
    atom_count = molecule.GetNumAtoms()
    bond_count = molecule.GetNumBonds()
    plot_title = (
        f"{display_name} - {sample_molecules[label]}"
        f"<br><sub>{atom_count} atoms, {bond_count} bonds</sub>"
    )
    fig.update_layout(title=plot_title)

    fig.show()

    print(
        f"   ✅ Atom count: {atom_count}",
        "   ✅ Features: Interactive hover, element symbols, smart text contrast",
        "",
        sep="\n",
    )

# Summary of what the interactive 2D plots offer
print(
    "🎉 NEW Interactive 2D Features:",
    "   ✓ Interactive plots instead of static images",
    "   ✓ Theme-matched backgrounds and bond colors",
    "   ✓ Smart text contrast - readable on all atom colors",
    "   ✓ CPK element coloring with element symbols (C, N, O, etc.)",
    "   ✓ Hover information showing element type and coordinates",
    "   ✓ Different bond styles for single/double/triple bonds",
    "   ✓ Can save as interactive HTML files!",
    sep="\n",
)

In [None]:
# Create a 3D conformer visualization for caffeine
print("🌐 Creating Beautiful 3D Molecular Visualizations...\n")

# Caffeine is used because it is one of the more complex molecules in the sample set
caffeine_mol = parsed_molecules["caffeine"]
if caffeine_mol is not None:
    print("☕ Creating 3D visualization of Caffeine...")
    # The SMILES is read from sample_molecules - the same entry caffeine_mol was
    # parsed from - rather than repeated as a literal, so the text passed to the
    # plot cannot drift away from the molecule that is actually drawn.
    fig_3d = create_3d_conformer_plot(
        caffeine_mol,
        molecule_name="Caffeine",
        smiles=sample_molecules["caffeine"],
    )
    fig_3d.show()

    print("✅ 3D caffeine visualization complete!")
    print("🎨 NEW smart features you get automatically:")
    print("   ✓ Meaningful title with molecule name")
    print("   ✓ Smart zoom - starts at perfect distance for molecule size")
    print("   ✓ Better text contrast - black text for visibility on all atoms")
    print("   ✓ Clean toolbar with only essential 3D controls")
    print("   ✓ Element symbols (C, N, O) clearly visible")
    print("   ✓ Professional CPK coloring and Quantifyr dark theme")
    print("💡 Much easier to use from the start - no manual zooming needed!")
else:
    # parse_smiles produced no molecule for caffeine, so there is nothing to plot
    print("❌ Could not create 3D visualization - caffeine molecule not available")

In [None]:
# Build a larger molecule table and summarise its numeric properties
print("📊 Analyzing Molecular Properties...\n")

# A broader SMILES set than the named samples, grouped by chemical family
extended_smiles = [
    # Alkanes & ethanol
    "CCO", "CC", "CCC", "CCCC", "CCCCC",
    # Aromatics
    "c1ccccc1", "c1ccc(C)cc1", "c1ccc(O)cc1",
    # Caffeine
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
    # Aspirin
    "CC(=O)OC1=CC=CC=C1C(=O)O",
    # Dopamine
    "NCCc1ccc(O)c(O)c1",
    # Isobutyric acid
    "CC(C)C(=O)O",
    # Palmitic acid
    "CCCCCCCCCCCCCCCC(=O)O",
]

# One row per input SMILES; the 'valid' column is summed to count usable rows
print("🔬 Processing extended molecule dataset...")
df_analysis = create_molecular_dataframe(extended_smiles)

total_rows = len(df_analysis)
valid_rows = df_analysis["valid"].sum()
print(f"✅ Processed {total_rows} molecules")
print(f"✅ {valid_rows} valid molecules for analysis")

# Descriptive statistics for the numeric descriptor columns, rounded to 2 decimals
print("\n📈 Basic Property Statistics:")
numeric_cols = ["molecular_weight", "logp", "tpsa", "num_rotatable_bonds", "num_rings"]
property_summary = df_analysis[numeric_cols].describe().round(2)
print(property_summary)

In [None]:
# Scatter matrix across the selected molecular properties
print("🎨 Creating Interactive Property Scatter Matrix...")

properties_to_plot = ["molecular_weight", "logp", "tpsa", "num_rings"]
fig_scatter = plot_molecular_properties(df_analysis, properties_to_plot)

# Override the title and fix the canvas size
scatter_layout = {
    "title": "Interactive Molecular Property Relationships",
    "height": 700,
    "width": 900,
}
fig_scatter.update_layout(**scatter_layout)
fig_scatter.show()

print(
    "✅ Interactive scatter matrix complete!",
    "💡 Hover over points to see SMILES strings!",
    sep="\n",
)

In [None]:
# Plot the molecular-weight distribution and call out the extremes
print("📊 Analyzing Molecular Weight Distribution...")

fig_dist = plot_property_distribution(df_analysis, "molecular_weight")
dist_layout = {
    "title": "Molecular Weight Distribution Analysis",
    "height": 600,
    "width": 800,
}
fig_dist.update_layout(**dist_layout)
fig_dist.show()

print("✅ Distribution analysis complete!")

# Split off the heavy (> 200) and light (< 50) ends of the weight range
print("\n🔍 Interesting Insights:")
weights = df_analysis["molecular_weight"]
heavy_molecules = df_analysis[weights > 200]
light_molecules = df_analysis[weights < 50]

# Same report for both subsets: a count, then the SMILES if any rows matched
weight_groups = (
    ("• Heaviest molecules (>200 Da)", heavy_molecules),
    ("• Lightest molecules (<50 Da)", light_molecules),
)
for headline, subset in weight_groups:
    print(f"{headline}: {len(subset)}")
    if len(subset):
        print(f"  - Examples: {subset['smiles'].tolist()}")

In [None]:
# Draw several molecules side by side as node/edge networks with named titles
print("🕸️  Creating Molecular Network Visualizations with Smart Features...")

# Display name -> SMILES; the two parallel lists below are derived from this
# mapping so that names and structures stay paired.
network_molecules = {
    "Ethanol": "CCO",
    "Benzene": "c1ccccc1",
    "Caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
    "Aspirin": "CC(=O)OC1=CC=CC=C1C(=O)O",
}
network_smiles = list(network_molecules.values())
network_names = list(network_molecules.keys())

# Only the first element of the returned pair is used here.
# NOTE(review): assumes the graphs come back one per SMILES and in input
# order; otherwise the names below would be misaligned - TODO confirm.
network_graphs, _ = load_molecule_dataset(network_smiles)

print(f"✅ Created {len(network_graphs)} molecular graphs")

# Pass names and SMILES alongside the graphs so each panel gets a meaningful title
network_options = {
    "max_molecules": 4,
    "molecule_names": network_names,
    "smiles_list": network_smiles,
}
fig_network = plot_molecular_network(network_graphs, **network_options)
fig_network.show()

print(
    "✅ Molecular network visualization complete!",
    "🎨 NEW smart features you get automatically:",
    "   ✓ Meaningful titles - 'Ethanol', 'Benzene' instead of 'Molecule 1, 2, 3...'",
    "   ✓ Smart text contrast - dark text on light atoms, light text on dark atoms",
    "   ✓ Larger atoms (22px) with better element symbol visibility",
    "   ✓ Clean toolbar with simplified controls",
    "   ✓ Professional CPK coloring with theme-adaptive bonds",
    "   ✓ 3-column layout for optimal readability",
    "   ✓ Enhanced hover information with molecule context",
    "💡 Each atom clearly shows its element symbol - C, N, O, etc.!",
    "💡 Text is readable on ALL atom colors - no more guessing what element it is!",
    sep="\n",
)