# Day 4: Quantum Chemistry & Electronic Structure Project

## 🎯 **PROJECT OVERVIEW**

**Duration:** 6 hours intensive coding
**Focus:** Quantum chemistry calculations, electronic structure theory, and ML integration
**Tools:** Psi4, PySCF, ASE, RDKit integration

### **Learning Objectives**
- Implement quantum chemistry calculations from scratch
- Master electronic structure methods (HF, DFT, post-HF)
- Build ML models for quantum property prediction
- Create automated QM calculation pipelines
- Develop quantum-classical hybrid workflows

### **Project Deliverables**
1. **QuantumChemistryEngine** - Complete QM calculation framework
2. **ElectronicStructureML** - ML models for QM properties
3. **QMDataPipeline** - Automated calculation workflows
4. **HybridQM-MLFramework** - Integrated quantum-classical system
5. **Production Portfolio** - Professional quantum chemistry toolkit

---

## 📋 **PREREQUISITES**

```bash
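# Required installations
# Psi4 is installed through conda; the remaining packages come from pip
conda install -c psi4 psi4
pip install pyscf ase rdkit deepchem
# General-purpose packages that the notebook cells below also import
pip install numpy scipy pandas matplotlib seaborn scikit-learn joblib torch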
```

In [None]:
# Essential imports for quantum chemistry
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Quantum chemistry libraries
try:
    import psi4
    print("✅ Psi4 available")
except ImportError:
    print("❌ Psi4 not available - install with: conda install -c psi4 psi4")

try:
    import pyscf
    from pyscf import gto, scf, dft, cc, mp
    print("✅ PySCF available")
except ImportError:
    print("❌ PySCF not available - install with: pip install pyscf")

# Chemistry and ML libraries
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
import deepchem as dc
from ase import Atoms
from ase.optimize import BFGS

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import torch
import torch.nn as nn
import torch.optim as optim

# Set random seeds
np.random.seed(42)
torch.manual_seed(42)

print("🚀 Day 4 Environment Ready - Quantum Chemistry & Electronic Structure")

# ASSESSMENT FRAMEWORK INITIALIZATION
# Loads the assessment framework (or a no-op stand-in when it cannot be
# imported), asks for a student ID and a specialization track, then opens the
# Day 4 assessment section. `assessment` is left as None if that last step fails.
import secrets  # stdlib; used only for the fallback student ID below

print("\n" + "="*70)
print("🎓 DAY 4 ASSESSMENT FRAMEWORK INITIALIZATION")
print("="*70)

try:
    from assessment_framework import create_assessment
    print("✅ Assessment framework loaded successfully")
except ImportError:
    print("⚠️ Assessment framework not found. Please ensure assessment_framework.py is available.")
    print("📁 Expected location: same directory as this notebook")

    # Create a basic assessment object for fallback
    class BasicAssessment:
        """No-op stand-in used when assessment_framework cannot be imported."""

        def start_section(self, section): pass
        def end_section(self, section): pass
        def record_activity(self, activity, result, metadata=None): pass
        def get_progress_summary(self): return {"overall_score": 0.0, "section_scores": {}}
        def get_comprehensive_report(self): return {"activities": []}
        def save_final_report(self, filename): pass

    def create_assessment(student_id, track="quantum_chemistry"):
        """Return the no-op assessment; both arguments are ignored."""
        return BasicAssessment()

# Student Information Collection
print("\n📝 Student Assessment Setup:")
student_id = input("Enter your student ID: ").strip()
if not student_id:
    # The global NumPy generator is seeded with a fixed value earlier in this
    # notebook, so drawing the ID from it would hand every student the same
    # number and would also shift the seeded stream used by later cells.
    # Use OS entropy instead; the range 1000-9998 matches the previous
    # np.random.randint(1000, 9999).
    student_id = f"student_day4_{1000 + secrets.randbelow(8999)}"
    print(f"Generated ID: {student_id}")

# Track Selection
print("\n🎯 Select your specialization track:")
print("1. 🔬 Computational Chemist")
print("2. ⚛️ Quantum Chemistry Researcher") 
print("3. 🧬 Materials Scientist")
print("4. 🤖 Quantum ML Developer")

track_choice = input("Enter choice (1-4): ").strip()
track_map = {
    "1": "computational_chemist",
    "2": "quantum_chemistry", 
    "3": "materials_science",
    "4": "quantum_ml"
}
# Any unrecognized choice falls back to the quantum chemistry track
selected_track = track_map.get(track_choice, "quantum_chemistry")

print(f"✅ Track selected: {selected_track.replace('_', ' ').title()}")

# Initialize Assessment System
try:
    assessment = create_assessment(student_id, track=selected_track)
    print(f"✅ Assessment initialized for track: {selected_track}")
    print(f"👤 Student ID: {student_id}")
    
    # Start Day 4 assessment
    assessment.start_section("day_4_quantum_chemistry")
    print("\n🎯 Day 4 Assessment: Quantum Chemistry & Electronic Structure")
    print("📊 Progress tracking enabled - All activities will be recorded")
    
except Exception as e:
    # Best effort: the notebook continues without assessment tracking
    print(f"⚠️ Assessment initialization warning: {e}")
    assessment = None

print("\n" + "="*70)
print("🚀 Ready to begin Day 4: Quantum Chemistry Project!")
print("="*70)

---

# 🧮 **SECTION 1: Quantum Chemistry Fundamentals** (90 minutes)

## Building a Complete Quantum Chemistry Engine

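We'll build a calculation engine that wraps Psi4 (with RDKit for geometry generation) behind one interface for Hartree-Fock, DFT, and MP2 calculations, and that falls back to mock results when Psi4 is unavailable.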

In [None]:
class QuantumChemistryEngine:
    """
    Professional quantum chemistry calculation engine
    Supports multiple QM methods and basis sets
    """
    
    def __init__(self, memory_gb=4, num_threads=4):
        self.memory_gb = memory_gb
        self.num_threads = num_threads
        self.results_cache = {}
        self.calculation_history = []
        
        # Configure Psi4 if available
        try:
            psi4.set_memory(f'{memory_gb} GB')
            psi4.set_num_threads(num_threads)
            psi4.core.set_output_file('psi4_output.dat', False)
            self.psi4_available = True
        except:
            self.psi4_available = False
            print("Warning: Psi4 not configured")
    
    def smiles_to_geometry(self, smiles, optimize=True):
        """
        Convert a SMILES string to a 3D geometry using RDKit.

        Args:
            smiles: SMILES string of the molecule.
            optimize: If True, relax the embedded conformer with MMFF.

        Returns:
            Tuple ``(atoms, coordinates)``: a list of element symbols
            (explicit hydrogens included) and a NumPy array with one
            ``[x, y, z]`` row per atom, read from the RDKit conformer.

        Raises:
            ValueError: If the SMILES cannot be parsed or no 3D conformer
                could be embedded.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES: {smiles}")

        # Add hydrogens and generate 3D coordinates
        mol = Chem.AddHs(mol)
        # EmbedMolecule reports failure with a negative conformer id instead
        # of raising; without this check the failure would only surface as an
        # opaque error from GetConformer() below.
        if AllChem.EmbedMolecule(mol, randomSeed=42) < 0:
            raise ValueError(f"Could not generate 3D coordinates for SMILES: {smiles}")

        if optimize:
            AllChem.MMFFOptimizeMolecule(mol)

        # Extract symbols and coordinates in atom-index order
        conf = mol.GetConformer()
        atoms = []
        coordinates = []
        for i, atom in enumerate(mol.GetAtoms()):
            pos = conf.GetAtomPosition(i)
            atoms.append(atom.GetSymbol())
            coordinates.append([pos.x, pos.y, pos.z])

        return atoms, np.array(coordinates)
    
    def build_psi4_molecule(self, atoms, coordinates, charge=0, multiplicity=1):
        """
        Assemble a geometry specification and hand it to ``psi4.geometry``.

        The first line carries the charge and multiplicity; every following
        line is an element symbol with its x, y, z values formatted to six
        decimals. Returns whatever ``psi4.geometry`` returns.
        """
        header = f"{charge} {multiplicity}\n"
        atom_lines = [
            f"{symbol} {xyz[0]:.6f} {xyz[1]:.6f} {xyz[2]:.6f}\n"
            for symbol, xyz in zip(atoms, coordinates)
        ]
        return psi4.geometry(header + "".join(atom_lines))
    
    def calculate_hartree_fock(self, smiles, basis='6-31G*', charge=0, multiplicity=1):
        """
        Perform a Hartree-Fock single-point calculation with Psi4.

        Args:
            smiles: SMILES string of the molecule.
            basis: Basis set name passed to Psi4.
            charge: Total molecular charge.
            multiplicity: Spin multiplicity (1 = singlet).

        Returns:
            dict with keys 'method', 'basis', 'energy', 'num_electrons',
            'num_orbitals', 'dipole', 'homo_energy', 'lumo_energy',
            'homo_lumo_gap', 'atoms' and 'coordinates'. The dict is also
            appended to ``calculation_history``. When Psi4 is unavailable or
            the calculation raises, the result of ``_mock_calculation`` is
            returned instead.
        """
        if not self.psi4_available:
            return self._mock_calculation('HF', smiles, basis)

        try:
            # Generate geometry
            atoms, coords = self.smiles_to_geometry(smiles)
            molecule = self.build_psi4_molecule(atoms, coords, charge, multiplicity)

            # Options are global in Psi4, so the reference is set on every
            # call: a restricted reference only fits singlets, and a value
            # left over from an earlier open-shell run must not leak in.
            psi4.set_options({
                'basis': basis,
                'reference': 'rhf' if multiplicity == 1 else 'uhf',
            })

            # Pass the molecule explicitly and request the wavefunction from
            # the energy call itself instead of relying on global state.
            energy, wfn = psi4.energy('hf', molecule=molecule, return_wfn=True)

            # Per-axis dipole variables are tried first (the names this code
            # has always read).
            # NOTE(review): newer Psi4 releases appear to expose only a single
            # 'SCF DIPOLE' array - confirm availability and units for the
            # installed version.
            try:
                dipole = np.array([wfn.variable(f'SCF DIPOLE {axis}') for axis in 'XYZ'])
            except Exception:
                dipole = np.asarray(wfn.variable('SCF DIPOLE'), dtype=float).reshape(-1)

            # Alpha orbital energies; HOMO is the last occupied alpha orbital
            orbital_energies = wfn.epsilon_a().np
            n_alpha = wfn.nalpha()

            results = {
                'method': 'HF',
                'basis': basis,
                'energy': energy,
                'num_electrons': n_alpha + wfn.nbeta(),
                'num_orbitals': wfn.nmo(),
                'dipole': dipole,
                'homo_energy': orbital_energies[n_alpha - 1],
                'lumo_energy': orbital_energies[n_alpha],
                'atoms': atoms,
                'coordinates': coords
            }

            # Calculate HOMO-LUMO gap
            results['homo_lumo_gap'] = results['lumo_energy'] - results['homo_energy']

            self.calculation_history.append(results)
            return results

        except Exception as e:
            # Deliberate best effort: report the failure and fall back to mock data
            print(f"HF calculation failed: {e}")
            return self._mock_calculation('HF', smiles, basis)
    
    def calculate_dft(self, smiles, functional='B3LYP', basis='6-31G*', charge=0, multiplicity=1):
        """
        Perform a DFT single-point calculation with Psi4.

        Args:
            smiles: SMILES string of the molecule.
            functional: Name of the density functional, used as the Psi4
                method name.
            basis: Basis set name passed to Psi4.
            charge: Total molecular charge.
            multiplicity: Spin multiplicity (1 = singlet).

        Returns:
            dict with keys 'method' (``'DFT-<functional>'``), 'basis',
            'energy', 'num_electrons', 'num_orbitals', 'dipole',
            'homo_energy', 'lumo_energy', 'homo_lumo_gap', 'atoms' and
            'coordinates'. The dict is also appended to
            ``calculation_history``. When Psi4 is unavailable or the
            calculation raises, the result of ``_mock_calculation`` is
            returned instead.
        """
        if not self.psi4_available:
            return self._mock_calculation('DFT', smiles, basis, functional=functional)

        try:
            # Generate geometry
            atoms, coords = self.smiles_to_geometry(smiles)
            molecule = self.build_psi4_molecule(atoms, coords, charge, multiplicity)

            # Options are global in Psi4, so the reference is set on every
            # call: a restricted reference only fits singlets, and a value
            # left over from an earlier open-shell run must not leak in.
            psi4.set_options({
                'basis': basis,
                'reference': 'rks' if multiplicity == 1 else 'uks',
            })

            # The functional name is itself the method string, so the
            # requested functional is what actually runs. The wavefunction is
            # requested from the energy call instead of global state.
            energy, wfn = psi4.energy(functional, molecule=molecule, return_wfn=True)

            # Per-axis dipole variables are tried first (the names this code
            # has always read).
            # NOTE(review): newer Psi4 releases appear to expose only a single
            # 'SCF DIPOLE' array - confirm availability and units for the
            # installed version.
            try:
                dipole = np.array([wfn.variable(f'SCF DIPOLE {axis}') for axis in 'XYZ'])
            except Exception:
                dipole = np.asarray(wfn.variable('SCF DIPOLE'), dtype=float).reshape(-1)

            # Alpha orbital energies; HOMO is the last occupied alpha orbital
            orbital_energies = wfn.epsilon_a().np
            n_alpha = wfn.nalpha()

            results = {
                'method': f'DFT-{functional}',
                'basis': basis,
                'energy': energy,
                'num_electrons': n_alpha + wfn.nbeta(),
                'num_orbitals': wfn.nmo(),
                'dipole': dipole,
                'homo_energy': orbital_energies[n_alpha - 1],
                'lumo_energy': orbital_energies[n_alpha],
                'atoms': atoms,
                'coordinates': coords
            }

            results['homo_lumo_gap'] = results['lumo_energy'] - results['homo_energy']

            self.calculation_history.append(results)
            return results

        except Exception as e:
            # Deliberate best effort: report the failure and fall back to mock data
            print(f"DFT calculation failed: {e}")
            return self._mock_calculation('DFT', smiles, basis, functional=functional)
    
    def calculate_mp2(self, smiles, basis='6-31G*', charge=0, multiplicity=1):
        """
        Perform an MP2 single-point calculation with Psi4.

        Args:
            smiles: SMILES string of the molecule.
            basis: Basis set name passed to Psi4.
            charge: Total molecular charge.
            multiplicity: Spin multiplicity (1 = singlet).

        Returns:
            dict with keys 'method', 'basis', 'energy',
            'correlation_energy', 'atoms' and 'coordinates'; it carries no
            orbital or dipole entries. The dict is also appended to
            ``calculation_history``. When Psi4 is unavailable or the
            calculation raises, the result of ``_mock_calculation`` is
            returned instead.
        """
        if not self.psi4_available:
            return self._mock_calculation('MP2', smiles, basis)

        try:
            # Generate geometry
            atoms, coords = self.smiles_to_geometry(smiles)
            molecule = self.build_psi4_molecule(atoms, coords, charge, multiplicity)

            # Options are global in Psi4, so the reference is set on every
            # call: a restricted reference only fits singlets, and a value
            # left over from an earlier open-shell run must not leak in.
            psi4.set_options({
                'basis': basis,
                'reference': 'rhf' if multiplicity == 1 else 'uhf',
            })

            # Run MP2 on the molecule built above rather than on whatever
            # molecule happens to be globally active.
            energy = psi4.energy('mp2', molecule=molecule)

            results = {
                'method': 'MP2',
                'basis': basis,
                'energy': energy,
                'correlation_energy': psi4.core.variable('MP2 CORRELATION ENERGY'),
                'atoms': atoms,
                'coordinates': coords
            }

            self.calculation_history.append(results)
            return results

        except Exception as e:
            # Deliberate best effort: report the failure and fall back to mock data
            print(f"MP2 calculation failed: {e}")
            return self._mock_calculation('MP2', smiles, basis)
    
    def _mock_calculation(self, method, smiles, basis, functional=None):
        """
        Generate mock results for demonstration when Psi4 is unavailable.

        The numbers are random draws, not chemistry. They are seeded from the
        SMILES string so the same molecule always yields the same mock values.

        Args:
            method: Method label ('HF', 'DFT', 'MP2', ...).
            smiles: SMILES string of the molecule.
            basis: Basis set name, copied into the result.
            functional: Optional functional name; when given, the 'method'
                entry becomes ``'<method>-<functional>'``.

        Returns:
            dict shaped like a real calculation result, plus
            ``'mock_data': True`` (and 'correlation_energy' for MP2). The
            dict is also appended to ``calculation_history``.

        Raises:
            ValueError: If the SMILES cannot be parsed.
        """
        import zlib  # stdlib; only needed for the stable seed below

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES: {smiles}")

        # Hydrogens are implicit in a freshly parsed SMILES, so add them
        # before summing atomic numbers; otherwise methane would count 6
        # electrons instead of 10. The mock has no charge argument, so this
        # is the neutral-molecule count.
        num_electrons = sum(atom.GetAtomicNum() for atom in Chem.AddHs(mol).GetAtoms())

        # hash() of a str is salted per interpreter process, so it cannot
        # give reproducible mock data across sessions; crc32 is stable.
        # A private generator also leaves the global NumPy RNG (and any seed
        # the caller set on it) untouched.
        rng = np.random.default_rng(zlib.crc32(smiles.encode('utf-8')))

        base_energy = -num_electrons * 0.5 + rng.normal(0, 0.1)

        atoms, coords = self.smiles_to_geometry(smiles)

        results = {
            'method': f'{method}-{functional}' if functional else method,
            'basis': basis,
            'energy': base_energy,
            'num_electrons': num_electrons,
            'num_orbitals': num_electrons // 2 + 10,
            'dipole': rng.normal(0, 1, 3),
            'homo_energy': -0.2 + rng.normal(0, 0.05),
            'lumo_energy': 0.1 + rng.normal(0, 0.05),
            'atoms': atoms,
            'coordinates': coords,
            'mock_data': True
        }

        results['homo_lumo_gap'] = results['lumo_energy'] - results['homo_energy']

        if method == 'MP2':
            results['correlation_energy'] = rng.normal(-0.1, 0.02)

        self.calculation_history.append(results)
        return results
    
    def geometry_optimization(self, smiles, method='B3LYP', basis='6-31G*'):
        """
        Perform a geometry optimization with Psi4.

        Args:
            smiles: SMILES string of the molecule.
            method: Psi4 method name used for the optimization
                (for example a functional name such as 'B3LYP').
            basis: Basis set name passed to Psi4.

        Returns:
            dict with keys 'method', 'basis', 'final_energy',
            'initial_coords', 'optimized_coords', 'atoms' and 'converged'.
            When Psi4 is unavailable or the optimization raises, the result
            of ``_mock_optimization`` is returned instead. The result is not
            added to ``calculation_history``.

        NOTE(review): Psi4 may recenter/reorient the molecule it is given, so
        'optimized_coords' need not share a frame with 'initial_coords' -
        confirm before computing per-atom displacements.
        """
        if not self.psi4_available:
            return self._mock_optimization(smiles, method, basis)

        try:
            # Start from the raw embedded geometry (no force-field relaxation)
            atoms, coords = self.smiles_to_geometry(smiles, optimize=False)
            molecule = self.build_psi4_molecule(atoms, coords)

            psi4.set_options({'basis': basis})

            # Hand the requested method to Psi4 as-is. Previously anything
            # whose name did not contain 'B3LYP' was silently replaced by
            # B3LYP while the result was still labelled with `method`.
            final_energy = psi4.optimize(method, molecule=molecule)

            # Get optimized geometry. Molecule.x/y/z report Bohr, so convert
            # to Angstrom to match the RDKit-derived initial coordinates.
            opt_mol = psi4.core.get_active_molecule()
            bohr_to_angstrom = psi4.constants.bohr2angstroms
            opt_coords = np.array([[opt_mol.x(i), opt_mol.y(i), opt_mol.z(i)]
                                   for i in range(opt_mol.natom())]) * bohr_to_angstrom

            results = {
                'method': f'{method} optimization',
                'basis': basis,
                'final_energy': final_energy,
                'initial_coords': coords,
                'optimized_coords': opt_coords,
                'atoms': atoms,
                'converged': True
            }

            return results

        except Exception as e:
            # Deliberate best effort: report the failure and fall back to mock data
            print(f"Geometry optimization failed: {e}")
            return self._mock_optimization(smiles, method, basis)
    
    def _mock_optimization(self, smiles, method, basis):
        """
        Mock geometry optimization
        """
        atoms, coords = self.smiles_to_geometry(smiles, optimize=False)
        
        # Simulate small coordinate changes
        opt_coords = coords + np.random.normal(0, 0.1, coords.shape)
        
        return {
            'method': f'{method} optimization',
            'basis': basis,
            'final_energy': -len(atoms) * 0.5 + np.random.normal(0, 0.1),
            'initial_coords': coords,
            'optimized_coords': opt_coords,
            'atoms': atoms,
            'converged': True,
            'mock_data': True
        }
    
    def batch_calculate(self, smiles_list, methods=['HF', 'B3LYP'], basis='6-31G*'):
        """
        Perform batch calculations on multiple molecules
        """
        results = []
        
        for i, smiles in enumerate(smiles_list):
            print(f"Calculating molecule {i+1}/{len(smiles_list)}: {smiles}")
            
            mol_results = {'smiles': smiles}
            
            for method in methods:
                try:
                    if method == 'HF':
                        calc_result = self.calculate_hartree_fock(smiles, basis)
                    elif method in ['B3LYP', 'PBE0', 'M06-2X']:
                        calc_result = self.calculate_dft(smiles, method, basis)
                    elif method == 'MP2':
                        calc_result = self.calculate_mp2(smiles, basis)
                    
                    mol_results[f'{method}_energy'] = calc_result['energy']
                    mol_results[f'{method}_homo_lumo_gap'] = calc_result.get('homo_lumo_gap', None)
                    mol_results[f'{method}_dipole_magnitude'] = np.linalg.norm(calc_result.get('dipole', [0,0,0]))
                    
                except Exception as e:
                    print(f"Failed {method} for {smiles}: {e}")
                    mol_results[f'{method}_energy'] = None
            
            results.append(mol_results)
        
        return pd.DataFrame(results)
    
    def get_calculation_summary(self):
        """
        Get summary of all calculations performed
        """
        if not self.calculation_history:
            return "No calculations performed yet"
        
        summary = pd.DataFrame(self.calculation_history)
        return summary[['method', 'basis', 'energy', 'homo_lumo_gap']].describe()

# Initialize the quantum chemistry engine
qm_engine = QuantumChemistryEngine(memory_gb=2, num_threads=2)
print("✅ QuantumChemistryEngine initialized")

In [None]:
# Smoke test: two single-point energies, one optimization, one small batch run
test_molecules = ['C', 'CC', 'C=C', 'C#C', 'c1ccccc1']  # methane, ethane, ethylene, acetylene, benzene

print("🧮 Testing Quantum Chemistry Calculations")
print("=" * 50)

# Single-point calculations on methane
methane_hf = qm_engine.calculate_hartree_fock('C', basis='6-31G')
hf_energy = methane_hf['energy']
hf_gap = methane_hf['homo_lumo_gap']
print(f"Methane HF/6-31G Energy: {hf_energy:.6f} Hartree")
print(f"HOMO-LUMO Gap: {hf_gap:.4f} Hartree")

methane_dft = qm_engine.calculate_dft('C', functional='B3LYP', basis='6-31G')
dft_energy = methane_dft['energy']
print(f"Methane B3LYP/6-31G Energy: {dft_energy:.6f} Hartree")

# Optimize ethylene from its unrelaxed embedded geometry
print("\n🔧 Geometry Optimization Test")
ethylene_opt = qm_engine.geometry_optimization('C=C', method='B3LYP', basis='6-31G')
print(f"Ethylene optimization converged: {ethylene_opt['converged']}")
print(f"Final energy: {ethylene_opt['final_energy']:.6f} Hartree")

# Run HF and B3LYP over the first three molecules only
print("\n📊 Batch Calculation Test")
batch_subset = test_molecules[:3]
batch_results = qm_engine.batch_calculate(batch_subset, methods=['HF', 'B3LYP'], basis='6-31G')

print("\nBatch Results Summary:")
summary_columns = ['smiles', 'HF_energy', 'B3LYP_energy', 'HF_homo_lumo_gap']
print(batch_results[summary_columns].round(4))

---
# 🧠 **SECTION 2: Electronic Structure ML** (90 minutes)

## **Objectives**
- Build ML models to predict quantum properties
- Implement transfer learning for QM calculations
- Create uncertainty quantification for predictions
- Develop quantum property encoders

## **Key Components**
1. **ElectronicStructureML** - Core ML framework
2. **QuantumPropertyEncoder** - Feature engineering for QM
3. **TransferLearningQM** - Knowledge transfer between methods
4. **UncertaintyQuantifier** - Confidence estimation

In [None]:
import torch
import torch.nn as nn
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
from scipy.stats import pearsonr
from rdkit.Chem import Descriptors, rdMolDescriptors
import deepchem as dc

class ElectronicStructureML:
    """
    Advanced ML framework for predicting quantum chemistry properties
    """
    
    def __init__(self, random_state=42):
        """
        Create empty bookkeeping containers and the model registry.

        ``model_configs`` maps a model name to an estimator: scikit-learn
        regressors for the tree ensembles, the network class returned by
        ``_create_neural_network`` for 'neural_network', and None for
        'graph_conv'.
        """
        self.random_state = random_state
        self.models = {}
        self.scalers = {}
        self.training_history = []
        self.feature_names = []

        # Build the estimators first, then register them in a fixed order
        forest = RandomForestRegressor(
            n_estimators=100,
            random_state=random_state,
            n_jobs=-1,
        )
        boosting = GradientBoostingRegressor(
            n_estimators=100,
            random_state=random_state,
        )

        self.model_configs = {}
        self.model_configs['random_forest'] = forest
        self.model_configs['gradient_boosting'] = boosting
        self.model_configs['neural_network'] = self._create_neural_network()
        self.model_configs['graph_conv'] = None  # Will be created when needed
    
    def _create_neural_network(self):
        """Create a PyTorch neural network for QM property prediction"""
        class QuantumNN(nn.Module):
            def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout=0.2):
                super().__init__()
                layers = []
                prev_dim = input_dim
                
                for hidden_dim in hidden_dims:
                    layers.extend([
                        nn.Linear(prev_dim, hidden_dim),
                        nn.ReLU(),
                        nn.BatchNorm1d(hidden_dim),
                        nn.Dropout(dropout)
                    ])
                    prev_dim = hidden_dim
                
                layers.append(nn.Linear(prev_dim, 1))
                self.network = nn.Sequential(*layers)
            
            def forward(self, x):
                return self.network(x)
        
        return QuantumNN
    
    def extract_quantum_features(self, smiles_list):
        """
        Extract a fixed-length RDKit descriptor vector for each molecule.

        Each row holds 30 descriptors, zero-padded to 50 columns. A SMILES
        that cannot be parsed yields an all-zero row, and non-finite
        descriptor values are replaced by 0.0 so the matrix can be fed
        straight into scaling and model fitting.

        Args:
            smiles_list: Iterable of SMILES strings.

        Returns:
            NumPy float array of shape ``(len(smiles_list), 50)``.

        Side effects:
            Fills ``self.feature_names`` with generic names on first use.
        """
        n_features = 50

        # Looked up once, outside the per-molecule loop. Two names differ
        # from the previous code because the old ones do not exist in RDKit:
        # the descriptor is spelled FractionCSP3, and BertzCT is exposed by
        # Descriptors, not rdMolDescriptors.
        descriptor_fns = [
            # Basic molecular descriptors
            Descriptors.MolWt,
            Descriptors.MolLogP,
            Descriptors.NumHDonors,
            Descriptors.NumHAcceptors,
            Descriptors.TPSA,
            Descriptors.NumRotatableBonds,
            Descriptors.NumAromaticRings,
            Descriptors.NumSaturatedRings,
            Descriptors.RingCount,
            Descriptors.FractionCSP3,
            # Electronic descriptors
            Descriptors.BalabanJ,
            Descriptors.Chi0n,
            Descriptors.Chi1n,
            Descriptors.HallKierAlpha,
            Descriptors.Kappa1,
            Descriptors.Kappa2,
            Descriptors.Kappa3,
            # Connectivity indices
            Descriptors.BertzCT,
            Descriptors.LabuteASA,
            Descriptors.EState_VSA1,
            Descriptors.EState_VSA2,
            Descriptors.VSA_EState1,
            Descriptors.VSA_EState2,
            # Quantum-relevant descriptors
            Descriptors.MaxAbsPartialCharge,
            Descriptors.MaxPartialCharge,
            Descriptors.MinAbsPartialCharge,
            Descriptors.MinPartialCharge,
            Descriptors.NumHeteroatoms,
            Descriptors.NumRadicalElectrons,
            Descriptors.NumValenceElectrons,
        ]

        features = []
        for smiles in smiles_list:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                features.append([0.0] * n_features)  # Default features
                continue

            mol_features = [float(fn(mol)) for fn in descriptor_fns]

            # Pad (or truncate) to the fixed length
            mol_features.extend([0.0] * (n_features - len(mol_features)))
            features.append(mol_features[:n_features])

        # Generate feature names if first time
        if not self.feature_names:
            self.feature_names = [f'qm_feature_{i}' for i in range(n_features)]

        # reshape keeps the (0, 50) shape for an empty input; nan_to_num
        # removes NaN/inf values (e.g. undefined partial charges).
        matrix = np.array(features, dtype=float).reshape(-1, n_features)
        return np.nan_to_num(matrix, nan=0.0, posinf=0.0, neginf=0.0)
    
    def prepare_training_data(self, qm_results_df):
        """
        Prepare training data from quantum chemistry results
        """
        # Extract features
        X = self.extract_quantum_features(qm_results_df['smiles'].tolist())
        
        # Prepare targets (multiple properties)
        targets = {}
        
        # Energy-based targets
        if 'HF_energy' in qm_results_df.columns:
            targets['hf_energy'] = qm_results_df['HF_energy'].values
        if 'B3LYP_energy' in qm_results_df.columns:
            targets['dft_energy'] = qm_results_df['B3LYP_energy'].values
        
        # Electronic properties
        if 'HF_homo_lumo_gap' in qm_results_df.columns:
            targets['homo_lumo_gap'] = qm_results_df['HF_homo_lumo_gap'].values
        
        # Calculate additional derived properties
        if 'HF_energy' in qm_results_df.columns and 'B3LYP_energy' in qm_results_df.columns:
            targets['correlation_energy'] = qm_results_df['B3LYP_energy'].values - qm_results_df['HF_energy'].values
        
        return X, targets
    
    def train_models(self, X, targets, test_size=0.2):
        """
        Train every registered model once per target property.

        Args:
            X: Feature matrix, one row per molecule.
            targets: Mapping of target name to array of target values.
            test_size: Fraction of samples held out for evaluation.

        Returns:
            dict mapping target name -> model name -> dict with 'model',
            'mae', 'r2', 'correlation', 'predictions' and 'true_values'
            (the neural network entry is whatever
            ``_train_neural_network`` returns).

        Side effects:
            Stores the fitted scaler in ``self.scalers[target]`` and the
            per-target results in ``self.models[target]``.
        """
        from sklearn.base import clone  # local import: fresh estimator per target

        results = {}

        for target_name, y in targets.items():
            print(f"\n🎯 Training models for {target_name}")
            print("-" * 40)

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=self.random_state
            )

            # Scale features (fit on the training split only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            self.scalers[target_name] = scaler

            target_results = {}

            # Train traditional ML models
            for model_name, template in self.model_configs.items():
                if model_name in ('neural_network', 'graph_conv'):
                    continue

                print(f"Training {model_name}...")
                # Fit a clone, not the registry instance. Refitting the same
                # object for each target would leave every target's stored
                # 'model' pointing at the one fitted on the last target.
                model = clone(template)
                model.fit(X_train_scaled, y_train)

                # Predictions
                y_pred = model.predict(X_test_scaled)

                # Metrics
                mae = mean_absolute_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
                # pearsonr raises for fewer than two points, which happens
                # with very small datasets (e.g. a one-sample test split).
                if len(y_test) >= 2:
                    correlation, _ = pearsonr(y_test, y_pred)
                else:
                    correlation = float('nan')

                target_results[model_name] = {
                    'model': model,
                    'mae': mae,
                    'r2': r2,
                    'correlation': correlation,
                    'predictions': y_pred,
                    'true_values': y_test
                }

                print(f"  MAE: {mae:.4f}, R²: {r2:.4f}, Correlation: {correlation:.4f}")

            # Train neural network
            nn_result = self._train_neural_network(X_train_scaled, y_train, X_test_scaled, y_test)
            target_results['neural_network'] = nn_result

            results[target_name] = target_results
            self.models[target_name] = target_results

        return results
    
    def _train_neural_network(self, X_train, y_train, X_test, y_test, epochs=100):
        """
        Train the PyTorch regressor with full-batch Adam and evaluate it.

        Args:
            X_train, X_test: Scaled feature matrices.
            y_train, y_test: Target values for the two splits.
            epochs: Number of full-batch optimization steps.

        Returns:
            dict with 'model', 'mae', 'r2', 'correlation', 'predictions',
            'true_values' and 'train_losses' (one loss value per epoch).
        """
        print(f"Training neural_network...")

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Accept lists and non-float arrays alike; the tensors below need
        # plain float arrays.
        y_train = np.asarray(y_train, dtype=float)
        y_test = np.asarray(y_test, dtype=float)

        # Convert to tensors; targets become column vectors to match the
        # network's single-output shape
        X_train_tensor = torch.FloatTensor(X_train).to(device)
        y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1)).to(device)
        X_test_tensor = torch.FloatTensor(X_test).to(device)

        # Create model; model_configs holds the network class, instantiated
        # here with the feature dimension
        model = self.model_configs['neural_network'](X_train.shape[1]).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()

        # Training loop (one gradient step on the whole training set per epoch)
        model.train()
        train_losses = []

        for epoch in range(epochs):
            optimizer.zero_grad()
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # Evaluation
        model.eval()
        with torch.no_grad():
            y_pred = model(X_test_tensor).cpu().numpy().flatten()

        # Metrics
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # pearsonr raises for fewer than two points, which happens with very
        # small datasets (e.g. a one-sample test split).
        if len(y_test) >= 2:
            correlation, _ = pearsonr(y_test, y_pred)
        else:
            correlation = float('nan')

        print(f"  MAE: {mae:.4f}, R²: {r2:.4f}, Correlation: {correlation:.4f}")

        return {
            'model': model,
            'mae': mae,
            'r2': r2,
            'correlation': correlation,
            'predictions': y_pred,
            'true_values': y_test,
            'train_losses': train_losses
        }
    
    def predict_properties(self, smiles_list, target_name, model_name='random_forest'):
        """
        Predict quantum properties for new molecules
        """
        if target_name not in self.models:
            raise ValueError(f"No trained model for {target_name}")
        
        # Extract features
        X = self.extract_quantum_features(smiles_list)
        X_scaled = self.scalers[target_name].transform(X)
        
        # Get model
        model = self.models[target_name][model_name]['model']
        
        # Predict
        if model_name == 'neural_network':
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            X_tensor = torch.FloatTensor(X_scaled).to(device)
            model.eval()
            with torch.no_grad():
                predictions = model(X_tensor).cpu().numpy().flatten()
        else:
            predictions = model.predict(X_scaled)
        
        return predictions
    
    def visualize_performance(self, target_name):
        """
        Plot a 2x2 performance dashboard for one trained target.

        Panels: grouped bars of MAE / R² / correlation per model (top left),
        predicted-vs-true scatter for random forest (top right) and gradient
        boosting (bottom left), and the neural network training loss curve
        (bottom right). Prints a message and returns if the target has no
        stored results.
        """
        if target_name not in self.models:
            print(f"No results for {target_name}")
            return

        stored = self.models[target_name]

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle(f'Model Performance for {target_name}', fontsize=16, fontweight='bold')

        models_to_plot = ['random_forest', 'gradient_boosting', 'neural_network']
        colors = ['blue', 'green', 'red']

        # --- Panel 1: metric comparison across the models that were trained
        trained = [name for name in models_to_plot if name in stored]
        model_names = [name.replace('_', ' ').title() for name in trained]
        mae_scores = [stored[name]['mae'] for name in trained]
        r2_scores = [stored[name]['r2'] for name in trained]
        corr_scores = [stored[name]['correlation'] for name in trained]

        bar_ax = axes[0, 0]
        x = np.arange(len(model_names))
        width = 0.25

        bar_ax.bar(x - width, mae_scores, width, label='MAE', alpha=0.8)
        bar_ax.bar(x, r2_scores, width, label='R²', alpha=0.8)
        bar_ax.bar(x + width, corr_scores, width, label='Correlation', alpha=0.8)

        bar_ax.set_xlabel('Models')
        bar_ax.set_ylabel('Score')
        bar_ax.set_title('Performance Metrics Comparison')
        bar_ax.set_xticks(x)
        bar_ax.set_xticklabels(model_names, rotation=45)
        bar_ax.legend()
        bar_ax.grid(True, alpha=0.3)

        # --- Panels 2 and 3: predicted vs. true for the two tree ensembles;
        # each model keeps its fixed panel even if the other one is missing
        scatter_axes = (axes[0, 1], axes[1, 0])
        for model_name, color, scatter_ax in zip(models_to_plot[:2], colors[:2], scatter_axes):
            if model_name not in stored:
                continue

            entry = stored[model_name]
            true_vals = entry['true_values']
            pred_vals = entry['predictions']

            scatter_ax.scatter(true_vals, pred_vals, alpha=0.6, color=color, s=30)

            # Perfect prediction line spanning the combined value range
            min_val = min(true_vals.min(), pred_vals.min())
            max_val = max(true_vals.max(), pred_vals.max())
            scatter_ax.plot([min_val, max_val], [min_val, max_val], 'k--', alpha=0.8)

            scatter_ax.set_xlabel('True Values')
            scatter_ax.set_ylabel('Predicted Values')
            scatter_ax.set_title(f'{model_name.replace("_", " ").title()} - R² = {self.models[target_name][model_name]["r2"]:.3f}')
            scatter_ax.grid(True, alpha=0.3)

        # --- Panel 4: neural network training curve
        if 'neural_network' in stored:
            loss_ax = axes[1, 1]
            loss_ax.plot(stored['neural_network']['train_losses'], color='red', alpha=0.8)
            loss_ax.set_xlabel('Epoch')
            loss_ax.set_ylabel('Training Loss')
            loss_ax.set_title('Neural Network Training Curve')
            loss_ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Initialize Electronic Structure ML framework
qm_ml = ElectronicStructureML(random_state=42)
print("✅ ElectronicStructureML framework initialized")

In [None]:
class QuantumPropertyEncoder:
    """
    Advanced feature engineering specifically for quantum chemistry properties.

    Two encoders are provided:

    * ``encode_molecular_graph`` - 100-element vector of aggregated atom and
      bond descriptors derived from a SMILES string.
    * ``encode_quantum_environment`` - 12-element vector describing the basis
      set, the method and simple molecular-complexity indicators.
    """

    # Layout of the graph encoding: 10 atom descriptors and 5 bond descriptors,
    # each summarised by 4 statistics (mean, std, max, min) -> 40 + 20 values,
    # zero-padded to 100.
    _N_ATOM_FEATURES = 10
    _N_BOND_FEATURES = 5
    _N_STATS = 4
    _GRAPH_ENCODING_SIZE = 100

    def __init__(self):
        # symbol -> [atomic_number, typical valence, valence electrons, radius]
        # Carbon carries 4 valence electrons (the table previously listed 2,
        # inconsistent with every other row).
        # NOTE(review): radii look like covalent radii in angstrom - confirm.
        self.atomic_features = {
            'H': [1, 1, 1, 0.31], 'C': [6, 4, 4, 0.76], 'N': [7, 3, 5, 0.71],
            'O': [8, 2, 6, 0.66], 'F': [9, 1, 7, 0.57], 'P': [15, 3, 5, 1.07],
            'S': [16, 2, 6, 1.05], 'Cl': [17, 1, 7, 0.99], 'Br': [35, 1, 7, 1.14],
            'I': [53, 1, 7, 1.33]
        }
        # key -> [bond order, bond strength]; key 12 is the aromatic entry
        # (order 1.5).
        # NOTE(review): strengths look like carbon-carbon bond energies in
        # kJ/mol - confirm.
        self.bond_features = {
            1: [1, 347], 2: [2, 614], 3: [3, 839], 12: [1.5, 518]
        }

    @staticmethod
    def _bond_key(bond_order):
        """Map a numeric bond order to its key in ``bond_features``."""
        # Aromatic bonds report an order of 1.5; truncating with int() would
        # silently file them under the single-bond entry (key 1).
        if bond_order == 1.5:
            return 12
        return int(bond_order)

    @staticmethod
    def _aggregate(rows, n_features):
        """
        Summarise per-item feature rows column-wise as [mean, std, max, min].

        Returns a flat float vector of length ``4 * n_features``; all zeros
        when ``rows`` is empty (e.g. a molecule without bonds or atoms).
        """
        if not rows:
            return np.zeros(4 * n_features)
        matrix = np.array(rows, dtype=float)
        return np.concatenate([
            matrix.mean(axis=0),
            matrix.std(axis=0),
            matrix.max(axis=0),
            matrix.min(axis=0),
        ])

    def encode_molecular_graph(self, smiles):
        """
        Create quantum-aware molecular graph encoding.

        Args:
            smiles: SMILES string of the molecule.

        Returns:
            1-D float array of length 100. Positions 0-39 hold the atom
            statistics, 40-59 the bond statistics, the rest is zero padding.
            An all-zero vector is returned when the SMILES cannot be parsed.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return np.zeros(self._GRAPH_ENCODING_SIZE)  # Default encoding

        # Atom-level features (unknown elements fall back to zeros)
        atom_rows = []
        for atom in mol.GetAtoms():
            atomic_info = self.atomic_features.get(atom.GetSymbol(), [0, 0, 0, 0])
            atom_rows.append([
                atomic_info[0],  # Atomic number
                atomic_info[1],  # Typical valence
                atomic_info[2],  # Valence electrons
                atomic_info[3],  # Radius
                atom.GetFormalCharge(),
                atom.GetHybridization().real,
                atom.GetIsAromatic(),
                atom.IsInRing(),
                atom.GetTotalNumHs(),
                atom.GetDegree()
            ])
        # A parsed molecule can still have zero atoms (e.g. an empty SMILES);
        # _aggregate then yields zeros instead of failing on an empty array.
        atom_encoding = self._aggregate(atom_rows, self._N_ATOM_FEATURES)

        # Bond-level features (unknown bond orders fall back to zeros)
        bond_rows = []
        for bond in mol.GetBonds():
            bond_key = self._bond_key(bond.GetBondTypeAsDouble())
            bond_info = self.bond_features.get(bond_key, [0, 0])
            bond_rows.append([
                bond_info[0],  # Bond order
                bond_info[1],  # Bond strength
                bond.GetIsAromatic(),
                bond.IsInRing(),
                bond.GetIsConjugated()
            ])
        bond_encoding = self._aggregate(bond_rows, self._N_BOND_FEATURES)

        # Combine encodings, then pad or truncate to the fixed size
        full_encoding = np.concatenate([atom_encoding, bond_encoding])
        target_size = self._GRAPH_ENCODING_SIZE
        if len(full_encoding) < target_size:
            full_encoding = np.pad(full_encoding, (0, target_size - len(full_encoding)))
        else:
            full_encoding = full_encoding[:target_size]

        return full_encoding

    def encode_quantum_environment(self, smiles, basis_set='6-31G', method='B3LYP'):
        """
        Encode computational environment features.

        Args:
            smiles: SMILES string of the molecule.
            basis_set: Basis-set name; unknown names encode as [0, 0, 0].
            method: Method/functional name; unknown names encode as
                [0, 0, 0, 0].

        Returns:
            1-D array of 12 values: 3 basis-set values, 4 method values and
            5 complexity indicators (atom count, bond count, ring count,
            BertzCT, heavy-atom count). The complexity indicators are zero
            when the SMILES cannot be parsed.
        """
        # Basis set encoding
        basis_encoding = {
            'STO-3G': [1, 0, 0], '3-21G': [0, 1, 0], '6-31G': [0, 0, 1],
            '6-31G*': [0, 0, 2], '6-31+G*': [0, 0, 3], 'cc-pVDZ': [1, 1, 1]
        }
        basis_features = basis_encoding.get(basis_set, [0, 0, 0])

        # Method encoding
        method_encoding = {
            'HF': [1, 0, 0, 0], 'B3LYP': [0, 1, 0, 0], 'PBE': [0, 0, 1, 0],
            'M06-2X': [0, 0, 0, 1], 'wB97X-D': [0, 0, 0, 2]
        }
        method_features = method_encoding.get(method, [0, 0, 0, 0])

        # Molecular complexity indicators
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            complexity_features = [
                mol.GetNumAtoms(),
                mol.GetNumBonds(),
                len(Chem.GetSymmSSSR(mol)),  # Ring count
                Descriptors.BertzCT(mol),     # Complexity index
                mol.GetNumHeavyAtoms()
            ]
        else:
            complexity_features = [0, 0, 0, 0, 0]

        return np.array(basis_features + method_features + complexity_features)

# Build the encoder once; `qp_encoder` stays available to later cells.
qp_encoder = QuantumPropertyEncoder()
print("✅ QuantumPropertyEncoder initialized")

# Smoke-test both encoders on a few small molecules
test_smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']
print("\n🧬 Testing Quantum Property Encoding")
print("=" * 45)

for smiles in test_smiles:
    env_args = {'basis_set': '6-31G', 'method': 'B3LYP'}
    graph_encoding = qp_encoder.encode_molecular_graph(smiles)
    env_encoding = qp_encoder.encode_quantum_environment(smiles, **env_args)

    # One report per molecule: vector shapes plus the combined feature count
    report_lines = (
        f"SMILES: {smiles}",
        f"  Graph encoding shape: {graph_encoding.shape}",
        f"  Environment encoding shape: {env_encoding.shape}",
        f"  Total features: {len(graph_encoding) + len(env_encoding)}",
    )
    print("\n".join(report_lines))

In [None]:
# Train the Electronic Structure ML models on freshly computed quantum data
print("🤖 Training Electronic Structure ML Models")
print("=" * 50)

# Training molecules grouped by chemical family; flattening the groups in
# insertion order gives the list handed to the QM engine.
molecule_families = {
    'Alkanes': ['C', 'CC', 'CCC', 'CCCC'],
    'Alkenes': ['C=C', 'CC=C', 'C=CC=C'],
    'Alkynes': ['C#C', 'CC#C'],
    'Alcohols': ['CO', 'CCO', 'CCCO'],
    'Aldehydes/Ketones': ['C=O', 'CC=O', 'CCC=O'],
    'Aromatics': ['c1ccccc1', 'c1ccc(C)cc1'],
    'Amines': ['CCN', 'CCCN', 'NC'],
    'Thiols': ['CS', 'CCS', 'CCCS'],
    'Fluorides': ['CF', 'CCF', 'CCCF'],
    'Fluorocarbons': ['C(F)(F)F', 'CC(F)(F)F'],
}
extended_molecules = [
    family_smiles
    for family in molecule_families.values()
    for family_smiles in family
]

print(f"Calculating quantum properties for {len(extended_molecules)} molecules...")

# Run every molecule with both methods in one batch
training_data = qm_engine.batch_calculate(
    extended_molecules,
    methods=['HF', 'B3LYP'],
    basis='6-31G'
)

print(f"✅ Generated {len(training_data)} quantum calculations")

# Turn the raw calculations into a feature matrix and per-property targets
X, targets = qm_ml.prepare_training_data(training_data)
print(f"Feature matrix shape: {X.shape}")
print(f"Available targets: {list(targets.keys())}")

# Fit models for every target, holding out 30% of the data
training_results = qm_ml.train_models(X, targets, test_size=0.3)

print("\n📊 Training Results Summary")
print("=" * 50)

for target_name, target_results in training_results.items():
    print(f"\n{target_name.upper()}:")
    for model_name, metrics in target_results.items():
        # Only entries that are metric dicts carrying an R² score are reported
        if not (isinstance(metrics, dict) and 'r2' in metrics):
            continue
        print(f"  {model_name:15s}: R² = {metrics['r2']:.3f}, MAE = {metrics['mae']:.4f}")

In [None]:
# Advanced prediction and uncertainty quantification
class UncertaintyQuantifier:
    """
    Quantify prediction uncertainty for quantum properties.

    Trains a bootstrap ensemble of random forests per target and reports the
    spread of the ensemble's predictions as the uncertainty.
    """

    def __init__(self, qm_ml_model, n_bootstrap=10, random_state=None):
        """
        Args:
            qm_ml_model: Object providing ``extract_quantum_features(smiles_list)``;
                used at prediction time to featurise molecules.
            n_bootstrap: Number of bootstrap models per target.
            random_state: Optional seed for the bootstrap resampling. When
                ``None`` the global NumPy RNG is used, as before.
        """
        self.qm_ml = qm_ml_model
        self.bootstrap_models = {}
        self.n_bootstrap = n_bootstrap
        self.random_state = random_state

    def fit_bootstrap_ensemble(self, X, targets, target_name):
        """
        Create bootstrap ensemble for uncertainty estimation.

        Args:
            X: Feature matrix, one row per sample.
            targets: Mapping of target name to per-sample values.
            target_name: Key in ``targets`` to fit the ensemble for.

        Raises:
            ValueError: If ``n_bootstrap`` is below 1, ``X`` is empty, or
                ``X`` and the target values differ in length.
        """
        # Coerce to arrays so list / Series inputs support fancy indexing.
        X = np.asarray(X)
        y = np.asarray(targets[target_name])
        n_samples = len(X)

        if self.n_bootstrap < 1:
            raise ValueError("n_bootstrap must be at least 1")
        if n_samples == 0:
            raise ValueError("Cannot fit a bootstrap ensemble on an empty dataset")
        if len(y) != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but target '{target_name}' has {len(y)}"
            )

        # Function-scope import: StandardScaler is not among the imports
        # visible at the top of the file.
        from sklearn.preprocessing import StandardScaler

        # np.random (module) and RandomState expose the same ``choice`` API.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        bootstrap_models = []

        print(f"Creating bootstrap ensemble for {target_name}...")

        for i in range(self.n_bootstrap):
            # Bootstrap sample: draw n_samples rows with replacement
            bootstrap_indices = rng.choice(n_samples, n_samples, replace=True)
            X_bootstrap = X[bootstrap_indices]
            y_bootstrap = y[bootstrap_indices]

            # Train model on bootstrap sample; each member keeps its own
            # scaler so prediction applies the matching transform.
            model = RandomForestRegressor(n_estimators=50, random_state=i)
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_bootstrap)
            model.fit(X_scaled, y_bootstrap)

            bootstrap_models.append((model, scaler))

        self.bootstrap_models[target_name] = bootstrap_models
        print(f"✅ Bootstrap ensemble ready ({self.n_bootstrap} models)")

    def predict_with_uncertainty(self, smiles_list, target_name):
        """
        Predict with uncertainty bounds.

        Args:
            smiles_list: Molecules to predict for, as SMILES strings.
            target_name: Target whose ensemble should be used.

        Returns:
            dict with ``predictions`` (ensemble mean), ``uncertainty``
            (ensemble standard deviation), ``ci_lower`` / ``ci_upper``
            (mean -/+ 1.96 std) and ``all_predictions`` (one row per
            ensemble member).

        Raises:
            ValueError: If no ensemble was fitted for ``target_name``.
        """
        if target_name not in self.bootstrap_models:
            raise ValueError(f"No bootstrap ensemble for {target_name}")

        # Extract features
        X = self.qm_ml.extract_quantum_features(smiles_list)

        # Collect predictions from all bootstrap models
        predictions = np.array([
            model.predict(scaler.transform(X))
            for model, scaler in self.bootstrap_models[target_name]
        ])

        # Calculate statistics across ensemble members (axis 0)
        mean_pred = predictions.mean(axis=0)
        std_pred = predictions.std(axis=0)

        # Confidence intervals (assuming normal distribution)
        ci_lower = mean_pred - 1.96 * std_pred  # 95% CI
        ci_upper = mean_pred + 1.96 * std_pred

        return {
            'predictions': mean_pred,
            'uncertainty': std_pred,
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
            'all_predictions': predictions
        }

# Wrap the trained ML framework in an uncertainty estimator
uncertainty_quantifier = UncertaintyQuantifier(qm_ml)

# Fit one bootstrap ensemble per available target
for target_name in targets:
    uncertainty_quantifier.fit_bootstrap_ensemble(X, targets, target_name)

print("\n🎯 Testing Uncertainty Quantification")
print("=" * 45)

# Molecules that are not part of the training set
test_molecules_uncertainty = ['CCCCCC', 'c1ccc(O)cc1', 'CC(C)C', 'C1CCC1']

# Only the first two targets are reported to keep the output short
for target_name in list(targets)[:2]:
    print(f"\nPredicting {target_name} with uncertainty:")

    uncertainty_results = uncertainty_quantifier.predict_with_uncertainty(
        test_molecules_uncertainty,
        target_name
    )

    for i, smiles in enumerate(test_molecules_uncertainty):
        # Pull the i-th entry of each result array in reporting order
        pred, unc, ci_low, ci_high = (
            uncertainty_results[field][i]
            for field in ('predictions', 'uncertainty', 'ci_lower', 'ci_upper')
        )

        print(f"  {smiles:12s}: {pred:8.4f} ± {unc:6.4f} [{ci_low:7.4f}, {ci_high:7.4f}]")

In [None]:
# Plot the performance dashboards produced by the ML framework
print("📊 Visualizing Electronic Structure ML Performance")
print("=" * 55)

# One dashboard per trained target
for target_name in training_results:
    print(f"\nGenerating plots for {target_name}...")
    qm_ml.visualize_performance(target_name)

# Feature importance analysis
def analyze_feature_importance(target_name='hf_energy', top_n=20):
    """
    Analyze which molecular features are most important for predictions.

    Plots the ``top_n`` largest random-forest feature importances for the
    given target as a horizontal bar chart with the most important feature
    at the top.

    Args:
        target_name: Key into ``qm_ml.models``.
        top_n: Maximum number of features to show (default 20).

    Returns:
        ``(top_indices, top_importance)`` sorted by ascending importance, so
        the last element is the most important feature. ``None`` (after
        printing a message) when no model, or no random-forest model, is
        stored for ``target_name``.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        # A slice of [-0:] would silently select every feature.
        raise ValueError("top_n must be at least 1")

    if target_name not in qm_ml.models:
        print(f"No model available for {target_name}")
        return None

    target_models = qm_ml.models[target_name]
    if 'random_forest' not in target_models:
        print(f"No random forest model available for {target_name}")
        return None

    # Get random forest model for feature importance
    rf_model = target_models['random_forest']['model']
    feature_importance = rf_model.feature_importances_

    # Most important features, ascending (fewer than top_n if the model has
    # fewer features)
    top_indices = np.argsort(feature_importance)[-top_n:]
    top_importance = feature_importance[top_indices]
    top_features = [f'Feature_{i}' for i in top_indices]

    # Create feature importance plot
    plt.figure(figsize=(12, 8))
    positions = range(len(top_importance))
    plt.barh(positions, top_importance)
    plt.yticks(positions, top_features)
    plt.xlabel('Feature Importance')
    plt.title(f'Top {len(top_indices)} Feature Importance for {target_name}')
    # No invert_yaxis(): barh draws position 0 at the bottom, so the ascending
    # order already places the most important feature at the top.
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return top_indices, top_importance

# Analyze feature importance
print("\n🔍 Feature Importance Analysis")
if 'hf_energy' in training_results:
    # analyze_feature_importance returns None (after printing a message) when
    # qm_ml holds no model for the target, so guard before unpacking.
    importance_result = analyze_feature_importance('hf_energy')
    if importance_result is not None:
        top_features, importance_scores = importance_result
        # Results are sorted by ascending importance; the last entry is the top one.
        print(f"Most important feature index: {top_features[-1]} (importance: {importance_scores[-1]:.4f})")

print("\n✅ Section 2: Electronic Structure ML completed!")
print("🎯 Key Achievements:")
print("   • Built comprehensive ML framework for quantum properties")
print("   • Implemented multiple model types (RF, GB, NN)")
print("   • Created quantum-aware feature engineering")
print("   • Developed uncertainty quantification")
print("   • Generated performance visualizations")

---
# ⚙️ **SECTION 3: QM Data Pipeline (90 minutes)**

## **Objectives**
- Build automated quantum calculation workflows
- Implement high-throughput screening systems
- Create database integration for QM results
- Develop parallel processing frameworks

## **Key Components**
1. **QMDataPipeline** - Automated calculation workflows
2. **HighThroughputQM** - Parallel processing system
3. **QMDatabaseManager** - Data storage and retrieval
4. **WorkflowOrchestrator** - Complex calculation sequences

In [None]:
import sqlite3
import asyncio
import concurrent.futures
from datetime import datetime
import json
import hashlib
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Any
import pickle
import time

@dataclass
class QMCalculationRequest:
    """
    Data structure for quantum calculation requests.

    After construction the instance also carries ``calculation_id``: the MD5
    hex digest of the smiles, method, basis, task type, charge, multiplicity
    and solvent. Requests that agree on those seven values therefore share an
    id; ``priority`` and ``metadata`` do not influence it. ``calculation_id``
    is a plain attribute, not a dataclass field, so it does not appear in
    ``repr()`` or ``asdict()``.
    """
    smiles: str
    method: str
    basis: str
    task_type: str  # 'energy', 'optimization', 'frequency'
    charge: int = 0
    multiplicity: int = 1
    solvent: Optional[str] = None
    priority: int = 1
    # None is accepted and replaced by a fresh dict per instance in
    # __post_init__, so instances never share one metadata dict.
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}

        # Generate unique ID for the calculation (identifier only; MD5 is not
        # used for anything security related here).
        content = f"{self.smiles}_{self.method}_{self.basis}_{self.task_type}_{self.charge}_{self.multiplicity}_{self.solvent}"
        self.calculation_id = hashlib.md5(content.encode()).hexdigest()

@dataclass
class QMCalculationResult:
    """
    Outcome of one quantum calculation.

    Only ``calculation_id``, ``request`` and ``success`` are required; every
    other field defaults to ``None``. A missing ``timestamp`` is filled in
    with the creation time of the instance.
    """
    calculation_id: str
    request: QMCalculationRequest
    success: bool
    energy: Optional[float] = None
    homo_lumo_gap: Optional[float] = None
    dipole_moment: Optional[float] = None
    optimized_geometry: Optional[List[List[float]]] = None
    frequencies: Optional[List[float]] = None
    error_message: Optional[str] = None
    computation_time: Optional[float] = None
    timestamp: Optional[str] = None

    def __post_init__(self):
        # An explicitly supplied timestamp is kept untouched.
        if self.timestamp is not None:
            return
        # Otherwise stamp with the local time in ISO-8601 form.
        self.timestamp = datetime.now().isoformat()

class QMDatabaseManager:
    """
    Database manager for quantum chemistry calculations.

    Persists calculation requests and their results in a SQLite file. Every
    method opens its own short-lived connection and closes it again, even
    when a statement fails. Because each call uses a new connection,
    ``db_path=":memory:"`` is not usable: every call would see a fresh,
    empty database.
    """

    def __init__(self, db_path="qm_calculations.db"):
        """
        Args:
            db_path: Path of the SQLite database file; the schema is created
                on construction if it does not exist yet.
        """
        self.db_path = db_path
        self.init_database()

    def _connect(self):
        """Open a new connection to the database file."""
        return sqlite3.connect(self.db_path)

    @staticmethod
    def _to_json(value):
        """
        Serialise ``value`` to JSON text.

        ``None`` becomes the text ``null``. Objects exposing ``tolist()``
        (e.g. NumPy arrays and scalars) are converted through it instead of
        making ``json.dumps`` fail.
        """
        def _fallback(obj):
            if hasattr(obj, "tolist"):
                return obj.tolist()
            raise TypeError(
                f"Object of type {type(obj).__name__} is not JSON serializable"
            )

        return json.dumps(value, default=_fallback)

    def init_database(self):
        """Initialize SQLite database with proper schema"""
        conn = self._connect()
        try:
            with conn:  # commit on success, roll back on error
                # Calculation requests table
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS calculation_requests (
                        calculation_id TEXT PRIMARY KEY,
                        smiles TEXT NOT NULL,
                        method TEXT NOT NULL,
                        basis TEXT NOT NULL,
                        task_type TEXT NOT NULL,
                        charge INTEGER DEFAULT 0,
                        multiplicity INTEGER DEFAULT 1,
                        solvent TEXT,
                        priority INTEGER DEFAULT 1,
                        metadata TEXT,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        status TEXT DEFAULT 'pending'
                    )
                ''')

                # Calculation results table
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS calculation_results (
                        calculation_id TEXT PRIMARY KEY,
                        success BOOLEAN NOT NULL,
                        energy REAL,
                        homo_lumo_gap REAL,
                        dipole_moment REAL,
                        optimized_geometry TEXT,
                        frequencies TEXT,
                        error_message TEXT,
                        computation_time REAL,
                        completed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        FOREIGN KEY (calculation_id) REFERENCES calculation_requests (calculation_id)
                    )
                ''')

                # Performance tracking table
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS performance_metrics (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        method TEXT NOT NULL,
                        basis TEXT NOT NULL,
                        avg_computation_time REAL,
                        success_rate REAL,
                        total_calculations INTEGER,
                        last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                ''')
        finally:
            conn.close()

    def add_calculation_request(self, request: "QMCalculationRequest"):
        """
        Add a new calculation request to the database.

        An existing row with the same ``calculation_id`` is replaced and its
        status reset to ``'pending'``.
        """
        conn = self._connect()
        try:
            with conn:
                conn.execute('''
                    INSERT OR REPLACE INTO calculation_requests 
                    (calculation_id, smiles, method, basis, task_type, charge, multiplicity, 
                     solvent, priority, metadata, status)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'pending')
                ''', (
                    request.calculation_id, request.smiles, request.method, request.basis,
                    request.task_type, request.charge, request.multiplicity, request.solvent,
                    request.priority, self._to_json(request.metadata)
                ))
        finally:
            conn.close()

    def store_calculation_result(self, result: "QMCalculationResult"):
        """
        Store calculation result in the database.

        The request's status is set to ``'completed'`` or ``'failed'`` and
        the result row is written in the same transaction, so a failure
        leaves neither change behind.
        """
        conn = self._connect()
        try:
            with conn:
                # Update request status
                conn.execute('''
                    UPDATE calculation_requests 
                    SET status = ? 
                    WHERE calculation_id = ?
                ''', ('completed' if result.success else 'failed', result.calculation_id))

                # Store result; geometry and frequencies are kept as JSON text
                conn.execute('''
                    INSERT OR REPLACE INTO calculation_results 
                    (calculation_id, success, energy, homo_lumo_gap, dipole_moment,
                     optimized_geometry, frequencies, error_message, computation_time)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    result.calculation_id, result.success, result.energy, result.homo_lumo_gap,
                    result.dipole_moment, self._to_json(result.optimized_geometry),
                    self._to_json(result.frequencies), result.error_message, result.computation_time
                ))
        finally:
            conn.close()

    def get_pending_calculations(self, limit=None):
        """
        Retrieve pending calculations ordered by priority.

        Args:
            limit: Maximum number of requests to return. A falsy value
                (``None`` or ``0``) means no limit.

        Returns:
            List of ``QMCalculationRequest`` objects, highest priority first
            and oldest first within the same priority.
        """
        # Columns are named explicitly so the unpacking below does not depend
        # on the physical column order of the table.
        query = '''
            SELECT calculation_id, smiles, method, basis, task_type,
                   charge, multiplicity, solvent, priority, metadata
            FROM calculation_requests 
            WHERE status = 'pending' 
            ORDER BY priority DESC, created_at ASC
        '''
        params = ()
        if limit:
            # Bound parameter instead of string interpolation
            query += ' LIMIT ?'
            params = (int(limit),)

        conn = self._connect()
        try:
            rows = conn.execute(query, params).fetchall()
        finally:
            conn.close()

        # Convert to QMCalculationRequest objects
        requests = []
        for (calculation_id, smiles, method, basis, task_type,
             charge, multiplicity, solvent, priority, metadata_json) in rows:
            request = QMCalculationRequest(
                smiles=smiles, method=method, basis=basis, task_type=task_type,
                charge=charge, multiplicity=multiplicity, solvent=solvent,
                priority=priority,
                metadata=json.loads(metadata_json) if metadata_json else {}
            )
            # Keep the stored id rather than the one recomputed on construction
            request.calculation_id = calculation_id
            requests.append(request)

        return requests

    def get_calculation_results(self, calculation_ids=None):
        """
        Retrieve calculation results.

        Args:
            calculation_ids: Optional iterable of ids to restrict the query
                to. A falsy value (``None`` or an empty collection) returns
                every stored result.

        Returns:
            List of row tuples: all ``calculation_results`` columns followed
            by the request's smiles, method, basis and task_type.
        """
        # Materialise once so generators and sets can be bound as parameters
        ids = list(calculation_ids) if calculation_ids else []

        conn = self._connect()
        try:
            if ids:
                placeholders = ','.join('?' for _ in ids)
                cursor = conn.execute(f'''
                    SELECT r.*, req.smiles, req.method, req.basis, req.task_type
                    FROM calculation_results r
                    JOIN calculation_requests req ON r.calculation_id = req.calculation_id
                    WHERE r.calculation_id IN ({placeholders})
                ''', ids)
            else:
                cursor = conn.execute('''
                    SELECT r.*, req.smiles, req.method, req.basis, req.task_type
                    FROM calculation_results r
                    JOIN calculation_requests req ON r.calculation_id = req.calculation_id
                ''')
            rows = cursor.fetchall()
        finally:
            conn.close()

        return rows

    def get_performance_summary(self):
        """
        Get performance summary statistics.

        Returns:
            List of ``(method, basis, total_calculations, avg_time,
            success_rate)`` tuples, one per method/basis pair that has stored
            results; ``success_rate`` is a percentage.
        """
        conn = self._connect()
        try:
            results = conn.execute('''
                SELECT 
                    method, basis,
                    COUNT(*) as total_calculations,
                    AVG(computation_time) as avg_time,
                    SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) as success_rate
                FROM calculation_results r
                JOIN calculation_requests req ON r.calculation_id = req.calculation_id
                GROUP BY method, basis
            ''').fetchall()
        finally:
            conn.close()

        return results

class HighThroughputQM:
    """
    High-throughput quantum chemistry calculation system.

    Queues calculation requests in the database manager and executes the
    pending ones on a thread pool, storing each result as it completes.
    """

    def __init__(self, qm_engine, db_manager, max_workers=4):
        """
        Args:
            qm_engine: Engine providing ``calculate_hartree_fock``,
                ``calculate_dft`` and ``geometry_optimization``.
            db_manager: Store providing ``add_calculation_request``,
                ``get_pending_calculations`` and ``store_calculation_result``.
            max_workers: Size of the thread pool used for processing.
        """
        self.qm_engine = qm_engine
        self.db_manager = db_manager
        self.max_workers = max_workers
        self.active_calculations = {}

    def submit_calculation_batch(self, smiles_list, methods, basis_sets, task_types=('energy',)):
        """
        Submit a batch of calculations to the pipeline.

        One request is created and stored for every combination of molecule,
        method, basis set and task type.

        Args:
            smiles_list: Iterable of SMILES strings.
            methods: Iterable of method names.
            basis_sets: Iterable of basis-set names.
            task_types: Iterable of task types; defaults to energy only.
                (Immutable default, so it cannot be shared-and-mutated
                across calls.)

        Returns:
            List of the submitted ``QMCalculationRequest`` objects.
        """
        from itertools import product

        requests = []

        # product() varies the right-most argument fastest, i.e. the same
        # order as nested loops, and also copes with one-shot iterators.
        for smiles, method, basis, task_type in product(
                smiles_list, methods, basis_sets, task_types):
            request = QMCalculationRequest(
                smiles=smiles,
                method=method,
                basis=basis,
                task_type=task_type
            )
            requests.append(request)
            self.db_manager.add_calculation_request(request)

        print(f"✅ Submitted {len(requests)} calculations to pipeline")
        return requests

    def process_calculation_queue(self, max_calculations=None):
        """
        Process pending calculations using parallel execution.

        Args:
            max_calculations: Passed to the database manager as the limit on
                how many pending requests to fetch.

        Returns:
            List of ``QMCalculationResult`` objects that were computed and
            stored, in completion order. A result whose retrieval or storage
            raises is reported on stdout and left out of the list.
        """
        pending = self.db_manager.get_pending_calculations(limit=max_calculations)

        if not pending:
            print("No pending calculations found")
            return []

        print(f"🚀 Processing {len(pending)} calculations with {self.max_workers} workers...")

        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all calculations
            future_to_request = {
                executor.submit(self._execute_calculation, request): request 
                for request in pending
            }

            # Collect results as they complete; storage happens here, in the
            # calling thread, not in the workers.
            for future in concurrent.futures.as_completed(future_to_request):
                request = future_to_request[future]
                try:
                    result = future.result()
                    self.db_manager.store_calculation_result(result)
                    results.append(result)

                    if result.success:
                        print(f"✅ {request.smiles} ({request.method}/{request.basis})")
                    else:
                        print(f"❌ {request.smiles} ({request.method}/{request.basis}): {result.error_message}")

                except Exception as e:
                    # Deliberate best-effort: one bad item must not abort the batch.
                    print(f"💥 Exception for {request.smiles}: {str(e)}")

        success_count = sum(1 for r in results if r.success)
        print(f"🎯 Completed: {success_count}/{len(results)} successful")

        return results

    def _execute_calculation(self, request: "QMCalculationRequest"):
        """
        Execute a single quantum calculation.

        Supports the ``'energy'`` and ``'optimization'`` task types. Any
        exception, including an unsupported task type, is converted into a
        failed ``QMCalculationResult`` carrying the error message, so this
        method itself does not raise for calculation errors.
        """
        start_time = time.time()

        try:
            if request.task_type == 'energy':
                # HF has its own engine entry point; everything else is
                # treated as a DFT functional name.
                if request.method.upper() == 'HF':
                    calc_result = self.qm_engine.calculate_hartree_fock(
                        request.smiles, basis=request.basis
                    )
                else:
                    calc_result = self.qm_engine.calculate_dft(
                        request.smiles, functional=request.method, basis=request.basis
                    )

                result = QMCalculationResult(
                    calculation_id=request.calculation_id,
                    request=request,
                    success=True,
                    energy=calc_result['energy'],
                    homo_lumo_gap=calc_result.get('homo_lumo_gap'),
                    dipole_moment=calc_result.get('dipole_moment'),
                    computation_time=time.time() - start_time
                )

            elif request.task_type == 'optimization':
                calc_result = self.qm_engine.geometry_optimization(
                    request.smiles, method=request.method, basis=request.basis
                )

                result = QMCalculationResult(
                    calculation_id=request.calculation_id,
                    request=request,
                    # Plain bool so the flag stays storable even if the
                    # engine reports a NumPy boolean.
                    success=bool(calc_result['converged']),
                    energy=calc_result.get('final_energy'),
                    optimized_geometry=calc_result.get('optimized_coords'),
                    computation_time=time.time() - start_time
                )

            else:
                raise ValueError(f"Unsupported task type: {request.task_type}")

        except Exception as e:
            result = QMCalculationResult(
                calculation_id=request.calculation_id,
                request=request,
                success=False,
                error_message=str(e),
                computation_time=time.time() - start_time
            )

        return result

# Initialize the QM data pipeline
print("🔧 Initializing QM Data Pipeline")
print("=" * 40)

# Results are persisted in the SQLite file "bootcamp_qm.db"; the schema is
# created with CREATE TABLE IF NOT EXISTS, so an existing file is reused.
db_manager = QMDatabaseManager("bootcamp_qm.db")
print("✅ Database manager initialized")

# `qm_engine` must already be defined by an earlier cell. Two worker threads
# are used for processing the queue.
ht_qm = HighThroughputQM(qm_engine, db_manager, max_workers=2)
print("✅ High-throughput QM system initialized")

In [None]:
# End-to-end walkthrough of the QM data pipeline: submit, process, analyse
print("🧪 QM Data Pipeline Demonstration")
print("=" * 45)

# 1. Candidate molecules, one row per chemical family
demo_molecules = [
    'C', 'CC', 'CCC', 'CCCC',           # Alkanes
    'C=C', 'CC=C', 'C=CC=C',            # Alkenes
    'c1ccccc1', 'c1ccc(C)cc1',          # Aromatics
    'CCO', 'CC(C)O', 'CCCO',            # Alcohols
    'C=O', 'CC=O', 'CCC=O',             # Carbonyls
    'CCN', 'CC(C)N', 'c1ccc(N)cc1',     # Amines
    'CC(=O)O', 'CCC(=O)O',              # Carboxylic acids
    'C1CCC1', 'C1CCCC1', 'C1CCCCC1',    # Cyclic compounds
]

print(f"Submitting calculations for {len(demo_molecules)} molecules...")

# Only the first ten molecules are submitted to keep the demo short;
# each is queued for an HF and a B3LYP energy calculation.
batch_requests = ht_qm.submit_calculation_batch(
    smiles_list=demo_molecules[:10],
    methods=['HF', 'B3LYP'],
    basis_sets=['6-31G'],
    task_types=['energy'],
)

print(f"Total requests submitted: {len(batch_requests)}")

# 2. Work through the queue (at most 20 pending requests)
print("\n🚀 Processing calculation queue...")
calculation_results = ht_qm.process_calculation_queue(max_calculations=20)

# 3. Split the outcomes and report on them
print("\n📊 Pipeline Results Analysis")
print("=" * 35)

successful_results, failed_results = [], []
for outcome in calculation_results:
    bucket = successful_results if outcome.success else failed_results
    bucket.append(outcome)

print(f"Successful calculations: {len(successful_results)}")
print(f"Failed calculations: {len(failed_results)}")

if successful_results:
    timings = [done.computation_time for done in successful_results]
    avg_time = np.mean(timings)
    print(f"Average computation time: {avg_time:.2f} seconds")

    # Energies grouped by the method that produced them
    hf_energies = []
    dft_energies = []
    for done in successful_results:
        if done.request.method == 'HF':
            hf_energies.append(done.energy)
        if done.request.method == 'B3LYP':
            dft_energies.append(done.energy)

    if hf_energies:
        print(f"HF energy range: [{min(hf_energies):.4f}, {max(hf_energies):.4f}] Hartree")
    if dft_energies:
        print(f"DFT energy range: [{min(dft_energies):.4f}, {max(dft_energies):.4f}] Hartree")

# 4. Per method/basis statistics straight from the database
print("\n💾 Database Performance Summary")
print("=" * 35)

performance_data = db_manager.get_performance_summary()
for summary_row in performance_data:
    method, basis, total, avg_time, success_rate = summary_row
    print(f"{method}/{basis}: {total} calcs, {avg_time:.2f}s avg, {success_rate:.1f}% success")

In [None]:
class WorkflowOrchestrator:
    """
    Orchestrate complex quantum chemistry workflows.

    Couples a high-throughput QM system (used through
    ``submit_calculation_batch`` and ``process_calculation_queue``) with an ML
    system (used through ``prepare_training_data``, ``train_models`` and
    ``predict_properties``). Every workflow definition, together with the
    results of its stages, is kept in ``self.workflows`` keyed by name.
    """

    # Fewest distinct molecules for which ML training is attempted
    MIN_MOLECULES_FOR_ML = 5

    def __init__(self, ht_qm_system, ml_system):
        self.ht_qm = ht_qm_system
        self.ml_system = ml_system
        self.workflows = {}

    def create_property_prediction_workflow(self, name, molecules, target_properties):
        """
        Create a workflow for comprehensive property prediction.

        Args:
            name: Key under which the workflow is registered; an existing
                workflow with the same name is replaced.
            molecules: SMILES strings to run through the workflow.
            target_properties: Property names stored with the workflow.

        Returns:
            The newly registered workflow dict (also kept in
            ``self.workflows[name]``).
        """
        workflow = {
            'name': name,
            'molecules': molecules,
            'target_properties': target_properties,
            'stages': [
                'qm_calculations',
                'ml_training',
                'prediction_validation',
                'uncertainty_analysis'
            ],
            'results': {}
        }

        self.workflows[name] = workflow
        return workflow

    @staticmethod
    def _pivot_successful_results(successful_results):
        """Return one row per molecule with ``<property>_<method>`` columns, or None if there is no data."""
        if not successful_results:
            return None

        qm_df = pd.DataFrame([
            {
                'smiles': result.request.smiles,
                'method': result.request.method,
                'energy': result.energy,
                'homo_lumo_gap': result.homo_lumo_gap,
                'computation_time': result.computation_time
            }
            for result in successful_results
        ])

        pivot_df = qm_df.pivot_table(
            index='smiles',
            columns='method',
            values=['energy', 'homo_lumo_gap'],
            aggfunc='first'
        ).reset_index()

        # Flatten the (property, method) column index; the index column has
        # an empty second level and keeps its plain name.
        pivot_df.columns = [
            '_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns
        ]
        return pivot_df

    def execute_workflow(self, workflow_name):
        """
        Execute a complete workflow.

        Runs the four stages in order and stores each stage's outcome under
        ``workflow['results']``:

        * ``qm_calculations``: list returned by the QM queue.
        * ``ml_training``: training results, or ``None`` when training was
          skipped for lack of data.
        * ``prediction_validation``: predictions per target for the first
          three molecules (empty when no model was trained).
        * ``uncertainty_analysis``: QM success rate and timing statistics.

        Args:
            workflow_name: Name of a workflow created earlier.

        Returns:
            The workflow dict with its ``results`` filled in.

        Raises:
            ValueError: If no workflow with that name exists.
        """
        if workflow_name not in self.workflows:
            raise ValueError(f"Workflow {workflow_name} not found")

        workflow = self.workflows[workflow_name]
        results = workflow['results']
        print(f"🔄 Executing workflow: {workflow['name']}")
        print("=" * 50)

        # Stage 1: QM Calculations
        print("📊 Stage 1: Quantum Chemistry Calculations")
        self.ht_qm.submit_calculation_batch(
            smiles_list=workflow['molecules'],
            methods=['HF', 'B3LYP'],
            basis_sets=['6-31G'],
            task_types=['energy']
        )

        qm_results = self.ht_qm.process_calculation_queue()
        results['qm_calculations'] = qm_results
        print(f"✅ Completed {len(qm_results)} QM calculations")

        successful_results = [r for r in qm_results if r.success]

        # Stage 2: ML Training
        print("\n🤖 Stage 2: Machine Learning Training")

        # Always record an outcome so later stages never hit a missing key,
        # even when every QM calculation failed.
        results['ml_training'] = None
        targets = {}

        pivot_df = self._pivot_successful_results(successful_results)
        if pivot_df is not None and len(pivot_df) >= self.MIN_MOLECULES_FOR_ML:
            X, targets = self.ml_system.prepare_training_data(pivot_df)
            results['ml_training'] = self.ml_system.train_models(X, targets, test_size=0.2)
            print(f"✅ Trained ML models on {len(pivot_df)} molecules")
        else:
            print("⚠️ Insufficient data for ML training")

        # Stage 3: Prediction Validation
        print("\n🎯 Stage 3: Prediction Validation")
        validation = {}
        if results['ml_training']:
            # Test predictions on a subset
            test_molecules = workflow['molecules'][:3]
            for target in targets.keys():
                predictions = self.ml_system.predict_properties(
                    test_molecules, target, 'random_forest'
                )
                validation[target] = predictions
                print(f"Predictions for {target}: {predictions[:3]}")
        results['prediction_validation'] = validation

        # Stage 4: Uncertainty Analysis
        print("\n📈 Stage 4: Uncertainty Analysis")
        total_calculations = len(qm_results)
        comp_times = [r.computation_time for r in successful_results]
        results['uncertainty_analysis'] = {
            # An empty queue counts as a 0% success rate instead of dividing by zero
            'qm_success_rate': (
                len(successful_results) / total_calculations if total_calculations else 0.0
            ),
            # NaN marks "no successful run to measure"
            'computation_time_stats': {
                'mean': np.mean(comp_times) if comp_times else float('nan'),
                'std': np.std(comp_times) if comp_times else float('nan')
            }
        }

        print(f"✅ Workflow '{workflow_name}' completed successfully")
        return workflow

    def visualize_workflow_results(self, workflow_name):
        """
        Create comprehensive visualization of workflow results.

        Draws a 2x2 figure: per-method QM success rate, computation time
        histogram, HF vs B3LYP energy scatter (with a linear fit when more
        than one molecule has both energies) and random-forest R² per target.
        Panels without data are left empty. Prints a message and returns
        without plotting when the workflow name is unknown.

        Args:
            workflow_name: Name of a workflow created earlier.
        """
        if workflow_name not in self.workflows:
            print(f"Workflow {workflow_name} not found")
            return

        workflow = self.workflows[workflow_name]
        results = workflow['results']

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle(f'Workflow Results: {workflow_name}', fontsize=16, fontweight='bold')

        # QM calculation success rates
        ax = axes[0, 0]
        qm_results = results.get('qm_calculations', [])
        if qm_results:
            methods = {}
            for result in qm_results:
                counts = methods.setdefault(
                    result.request.method, {'success': 0, 'total': 0}
                )
                counts['total'] += 1
                if result.success:
                    counts['success'] += 1

            method_names = list(methods.keys())
            success_rates = [
                methods[m]['success'] / methods[m]['total'] * 100 for m in method_names
            ]

            ax.bar(method_names, success_rates, alpha=0.7)
            ax.set_ylabel('Success Rate (%)')
            ax.set_title('QM Calculation Success Rates')
            ax.set_ylim(0, 100)

        # Computation time distribution
        ax = axes[0, 1]
        if qm_results:
            comp_times = [r.computation_time for r in qm_results if r.success]
            if comp_times:
                ax.hist(comp_times, bins=10, alpha=0.7, edgecolor='black')
                ax.set_xlabel('Computation Time (s)')
                ax.set_ylabel('Frequency')
                ax.set_title('Computation Time Distribution')

        # Energy correlation (HF vs DFT)
        ax = axes[1, 0]
        energies_by_molecule = {}
        for result in qm_results:
            if result.success:
                per_method = energies_by_molecule.setdefault(result.request.smiles, {})
                per_method[result.request.method] = result.energy

        # Only molecules that have both energies can be paired
        hf_energies = []
        dft_energies = []
        for energies in energies_by_molecule.values():
            if 'HF' in energies and 'B3LYP' in energies:
                hf_energies.append(energies['HF'])
                dft_energies.append(energies['B3LYP'])

        if hf_energies and dft_energies:
            ax.scatter(hf_energies, dft_energies, alpha=0.6)
            ax.set_xlabel('HF Energy (Hartree)')
            ax.set_ylabel('DFT Energy (Hartree)')
            ax.set_title('HF vs DFT Energy Correlation')

            # Add correlation line
            if len(hf_energies) > 1:
                z = np.polyfit(hf_energies, dft_energies, 1)
                p = np.poly1d(z)
                ax.plot(hf_energies, p(hf_energies), "r--", alpha=0.8)

        # ML performance (if available)
        ax = axes[1, 1]
        ml_results = results.get('ml_training')
        if ml_results:
            target_names = []
            r2_scores = []

            for target, models in ml_results.items():
                if 'random_forest' in models:
                    target_names.append(target.replace('_', ' ').title())
                    r2_scores.append(models['random_forest']['r2'])

            if target_names:
                ax.bar(target_names, r2_scores, alpha=0.7)
                ax.set_ylabel('R² Score')
                ax.set_title('ML Model Performance')
                # R² can be negative for a poor model; extend the axis
                # downwards so such bars stay visible.
                ax.set_ylim(min(0, min(r2_scores)), 1)
                plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

        plt.tight_layout()
        plt.show()

# Wire the high-throughput QM system and the ML system into one orchestrator
orchestrator = WorkflowOrchestrator(ht_qm_system=ht_qm, ml_system=qm_ml)
print("✅ WorkflowOrchestrator initialized")

# Register a small demonstration workflow (it is executed in a later cell)
print("\n🔄 Creating Demo Workflow")
print("=" * 30)

demo_workflow = orchestrator.create_property_prediction_workflow(
    "Drug-like_Molecules_Analysis",
    ['CCO', 'c1ccccc1', 'CC(=O)O', 'CCN', 'c1ccc(O)cc1'],
    ['energy', 'homo_lumo_gap', 'dipole_moment'],
)

# Echo what was registered
print(f"Created workflow: {demo_workflow['name']}")
print(f"Molecules: {len(demo_workflow['molecules'])}")
print(f"Stages: {demo_workflow['stages']}")

In [None]:
# Execute the demonstration workflow, plot it and print a short summary.
# Any failure is reported instead of aborting the notebook, because the QM
# backends are optional in this tutorial.
print("🚀 Executing Demo Workflow")
print("=" * 35)

try:
    completed_workflow = orchestrator.execute_workflow("Drug-like_Molecules_Analysis")

    # Visualize the results
    print("\n📊 Generating Workflow Visualization")
    orchestrator.visualize_workflow_results("Drug-like_Molecules_Analysis")

    # Summary statistics
    print("\n📈 Workflow Summary")
    print("=" * 25)

    qm_results = completed_workflow['results']['qm_calculations']
    successful = [r for r in qm_results if r.success]

    print(f"Total QM calculations: {len(qm_results)}")
    print(f"Successful calculations: {len(successful)}")
    # Skip the rate when nothing ran: dividing by zero here would be caught
    # below and misreported as missing quantum chemistry software.
    if qm_results:
        print(f"Success rate: {len(successful)/len(qm_results)*100:.1f}%")

    # Average each method only over its own successful runs, and only when
    # there is at least one (the mean of an empty list is NaN).
    hf_run_energies = [r.energy for r in successful if r.request.method == 'HF']
    dft_run_energies = [r.energy for r in successful if r.request.method == 'B3LYP']

    if hf_run_energies:
        avg_energy_hf = np.mean(hf_run_energies)
        print(f"Average HF energy: {avg_energy_hf:.4f} Hartree")
    if dft_run_energies:
        avg_energy_dft = np.mean(dft_run_energies)
        print(f"Average DFT energy: {avg_energy_dft:.4f} Hartree")

    ml_results = completed_workflow['results'].get('ml_training')
    if ml_results:
        print(f"ML targets trained: {len(ml_results)}")
        for target, models in ml_results.items():
            if 'random_forest' in models:
                r2 = models['random_forest']['r2']
                print(f"  {target}: R² = {r2:.3f}")

except Exception as e:
    print(f"❌ Workflow execution failed: {str(e)}")
    print("This may be due to missing quantum chemistry software")

print("\n✅ Section 3: QM Data Pipeline completed!")
print("🎯 Key Achievements:")
print("   • Built automated QM calculation workflows")
print("   • Implemented high-throughput quantum calculations")
print("   • Created ML models for quantum property prediction")
print("   • Developed uncertainty quantification methods")

# ASSESSMENT CHECKPOINT 4.1: Quantum Chemistry Fundamentals
# Scores Section 1 from which engine methods exist and which earlier results
# are present, then records the outcome with the assessment framework. The
# whole checkpoint is skipped when no assessment object is available.
print("\n" + "="*70)
print("🎯 ASSESSMENT CHECKPOINT 4.1: Quantum Chemistry Fundamentals")
print("="*70)

if assessment:
    assessment.start_section("section_1_fundamentals")

    # Concept Assessment - Electronic Structure Theory
    theory_concepts = {
        "electronic_structure": 0,
        "basis_sets": 0,
        "hartree_fock": 0,
        "dft_methods": 0,
        "homo_lumo_gap": 0,
        "post_hf_methods": 0
    }

    # Evaluate implementation understanding
    if hasattr(qm_engine, 'calculate_hartree_fock') and methane_hf.get('energy'):
        theory_concepts["hartree_fock"] = 1
        print("✅ Hartree-Fock implementation successful")

    if hasattr(qm_engine, 'calculate_dft') and methane_dft.get('energy'):
        theory_concepts["dft_methods"] = 1
        print("✅ DFT implementation successful")

    if methane_hf.get('homo_lumo_gap') is not None:
        theory_concepts["homo_lumo_gap"] = 1
        print("✅ HOMO-LUMO gap calculation successful")

    if hasattr(qm_engine, 'smiles_to_geometry'):
        theory_concepts["electronic_structure"] = 1
        print("✅ Molecular geometry handling successful")

    if hasattr(qm_engine, 'geometry_optimization') and ethylene_opt.get('converged'):
        theory_concepts["basis_sets"] = 1
        print("✅ Geometry optimization with basis sets successful")

    if hasattr(qm_engine, 'calculate_mp2'):
        theory_concepts["post_hf_methods"] = 1
        print("✅ Post-HF methods implementation available")

    # Calculate mastery score (fraction of concepts demonstrated)
    fundamentals_score = sum(theory_concepts.values()) / len(theory_concepts)

    # Activity-based assessment
    activities_completed = []

    # Check batch calculation completion
    if 'batch_results' in locals() and not batch_results.empty:
        activities_completed.append("batch_calculations")
        assessment.record_activity(
            "batch_quantum_calculations",
            {"status": "completed", "molecules_calculated": len(batch_results)},
            {"method": "multiple", "success_rate": 1.0}
        )
        print("✅ Batch calculations completed successfully")

    # Check method comparison
    if methane_hf.get('energy') and methane_dft.get('energy'):
        energy_diff = abs(methane_hf['energy'] - methane_dft['energy'])
        activities_completed.append("method_comparison")
        assessment.record_activity(
            "hf_vs_dft_comparison",
            {"status": "completed", "energy_difference": energy_diff},
            {"methods_compared": ["HF", "B3LYP"], "basis": "6-31G"}
        )
        print(f"✅ HF vs DFT comparison: ΔE = {energy_diff:.6f} Hartree")

    # Section completion: equal weight for concepts and activities
    activity_score = len(activities_completed) / 2  # 2 expected activities
    section_1_score = (fundamentals_score + activity_score) / 2

    assessment.record_activity(
        "section_1_fundamentals",
        {
            "status": "completed",
            "mastery_score": fundamentals_score,
            "activity_score": activity_score,
            "overall_score": section_1_score
        },
        {
            "concepts_mastered": sum(theory_concepts.values()),
            "total_concepts": len(theory_concepts),
            "activities_completed": activities_completed
        }
    )

    assessment.end_section("section_1_fundamentals")

    print(f"\n📊 Section 1 Assessment Results:")
    print(f"   🧠 Theory Mastery: {fundamentals_score:.1%}")
    print(f"   🔧 Activity Completion: {activity_score:.1%}")
    print(f"   📈 Overall Score: {section_1_score:.1%}")

    if section_1_score >= 0.8:
        print("🌟 Outstanding! Ready for advanced electronic structure ML")
    elif section_1_score >= 0.6:
        print("✅ Good progress! Continue to electronic structure modeling")
    else:
        print("📚 Consider reviewing quantum chemistry fundamentals")

else:
    print("⚠️ Assessment not available - continuing with learning content")

print("\n" + "="*70)
# 📊 Assessment checkpoint 4.2: electronic structure ML models.
# Scores Section 2 from the methods exposed by `esml` and from the presence of
# `train_results`, then records the outcome with the assessment framework.

print("🎯 CHECKPOINT 4.2: Electronic Structure ML Assessment")
print("="*65)

if not assessment:
    print("⚠️ Assessment not available - continuing with learning content")
else:
    assessment.start_section("section_2_electronic_ml")

    # Concept assessment: every concept starts unproven
    ml_concepts = dict.fromkeys(
        (
            "feature_engineering",
            "ml_models",
            "neural_networks",
            "transfer_learning",
            "uncertainty_quantification",
            "model_validation",
        ),
        0,
    )

    # (concept, esml attribute that proves it, also needs train_results?, message)
    ml_concept_checks = (
        ("feature_engineering", 'extract_features', False,
         "✅ Feature engineering implementation successful"),
        ("ml_models", 'train_models', True,
         "✅ ML model training successful"),
        ("neural_networks", '_train_neural_network', False,
         "✅ Neural network implementation available"),
        ("transfer_learning", 'transfer_learning', False,
         "✅ Transfer learning implementation available"),
        ("uncertainty_quantification", 'predict_with_uncertainty', False,
         "✅ Uncertainty quantification implemented"),
        ("model_validation", 'cross_validate', False,
         "✅ Model validation methods available"),
    )
    for concept_key, attr_name, needs_train_results, message in ml_concept_checks:
        if not hasattr(esml, attr_name):
            continue
        if needs_train_results and 'train_results' not in locals():
            continue
        ml_concepts[concept_key] = 1
        print(message)

    # Mastery score: fraction of concepts demonstrated
    ml_mastery_score = sum(ml_concepts.values()) / len(ml_concepts)

    # Activity-based assessment
    ml_activities_completed = []

    # Activity 1: model training
    if 'train_results' in locals() and train_results:
        ml_activities_completed.append("model_training")

        # Best R² across all targets and models, never reported below 0.0
        reported_r2 = [
            metrics['r2']
            for models in train_results.values()
            for metrics in models.values()
            if 'r2' in metrics
        ]
        best_model_performance = max([0.0, *reported_r2])

        assessment.record_activity(
            "ml_model_training",
            {
                "status": "completed",
                "best_r2_score": best_model_performance,
                "models_trained": sum(len(models) for models in train_results.values())
            },
            {"targets": list(train_results.keys()), "performance": best_model_performance}
        )
        print(f"✅ ML model training completed - Best R² score: {best_model_performance:.4f}")

    # Activity 2: performance analysis / visualization
    if 'comparison_plot' in locals() or hasattr(esml, 'plot_model_comparison'):
        ml_activities_completed.append("performance_analysis")
        assessment.record_activity(
            "model_performance_analysis",
            {"status": "completed", "visualizations_created": True},
            {"analysis_type": "comparative_performance"}
        )
        print("✅ Model performance analysis completed")

    # Activity 3: uncertainty quantification
    if hasattr(esml, 'predict_with_uncertainty'):
        ml_activities_completed.append("uncertainty_analysis")
        assessment.record_activity(
            "uncertainty_quantification",
            {"status": "implemented", "method": "ensemble_based"},
            {"uncertainty_method": "model_ensemble"}
        )
        print("✅ Uncertainty quantification implemented")

    # Section score: concepts and activities weigh equally
    ml_activity_score = len(ml_activities_completed) / 3  # 3 expected activities
    section_2_score = (ml_mastery_score + ml_activity_score) / 2

    assessment.record_activity(
        "section_2_electronic_ml",
        {
            "status": "completed",
            "mastery_score": ml_mastery_score,
            "activity_score": ml_activity_score,
            "overall_score": section_2_score
        },
        {
            "concepts_mastered": sum(ml_concepts.values()),
            "total_concepts": len(ml_concepts),
            "activities_completed": ml_activities_completed,
            "ml_framework": "implemented"
        }
    )

    assessment.end_section("section_2_electronic_ml")

    print(f"\n📊 Section 2 Assessment Results:")
    print(f"   🧠 ML Theory Mastery: {ml_mastery_score:.1%}")
    print(f"   🔧 Implementation Completion: {ml_activity_score:.1%}")
    print(f"   📈 Overall Score: {section_2_score:.1%}")

    if section_2_score >= 0.8:
        print("🌟 Excellent! Advanced ML techniques mastered")
    elif section_2_score >= 0.6:
        print("✅ Good progress! Ready for QM pipeline development")
    else:
        print("📚 Consider reviewing ML fundamentals for QM applications")

print("\n" + "="*65)
# 📊 Assessment checkpoint 4.3: QM data pipeline and high-throughput workflows.
# Scores Section 3 from the methods exposed by `ht_qm`, `db_manager` and
# `orchestrator` and from the executed demo workflow, then records the outcome.

print("🎯 CHECKPOINT 4.3: QM Pipeline & Workflow Assessment")
print("="*65)

if not assessment:
    print("⚠️ Assessment not available - continuing with learning content")
else:
    assessment.start_section("section_3_qm_pipeline")

    # Concept assessment: every concept starts unproven
    pipeline_concepts = dict.fromkeys(
        (
            "automated_workflows",
            "high_throughput_qm",
            "database_integration",
            "parallel_processing",
            "workflow_orchestration",
            "pipeline_monitoring",
        ),
        0,
    )

    # (object to inspect, attribute that proves the concept, concept, message)
    pipeline_concept_checks = (
        (ht_qm, 'submit_calculation_batch', "high_throughput_qm",
         "✅ High-throughput QM implementation successful"),
        (db_manager, 'store_calculation_result', "database_integration",
         "✅ Database integration implemented"),
        (ht_qm, 'process_calculation_queue', "parallel_processing",
         "✅ Parallel processing capability available"),
        (orchestrator, 'create_property_prediction_workflow', "workflow_orchestration",
         "✅ Workflow orchestration implemented"),
        (orchestrator, 'execute_workflow', "automated_workflows",
         "✅ Automated workflow execution available"),
        (orchestrator, 'visualize_workflow_results', "pipeline_monitoring",
         "✅ Pipeline monitoring and visualization implemented"),
    )
    for component, attr_name, concept_key, message in pipeline_concept_checks:
        if hasattr(component, attr_name):
            pipeline_concepts[concept_key] = 1
            print(message)

    # Mastery score: fraction of concepts demonstrated
    pipeline_mastery_score = sum(pipeline_concepts.values()) / len(pipeline_concepts)

    # Activity-based assessment
    pipeline_activities_completed = []

    # Activity 1: the demo workflow was executed
    if 'completed_workflow' in locals() and completed_workflow:
        pipeline_activities_completed.append("workflow_execution")

        qm_results = completed_workflow['results'].get('qm_calculations', [])
        successful_calcs = [calc for calc in qm_results if calc.success]
        if qm_results:
            success_rate = len(successful_calcs) / len(qm_results)
        else:
            success_rate = 0

        assessment.record_activity(
            "qm_pipeline_execution",
            {
                "status": "completed",
                "total_calculations": len(qm_results),
                "successful_calculations": len(successful_calcs),
                "success_rate": success_rate
            },
            {
                "workflow_type": "property_prediction",
                "pipeline_stages": completed_workflow.get('stages', [])
            }
        )
        print(f"✅ QM pipeline executed - Success rate: {success_rate:.1%}")

    # Activity 2: database operations
    if hasattr(db_manager, 'get_calculation_summary'):
        pipeline_activities_completed.append("database_operations")
        assessment.record_activity(
            "database_management",
            {"status": "implemented", "operations": "CRUD_complete"},
            {"database_type": "SQLite", "table_structure": "optimized"}
        )
        print("✅ Database operations implemented")

    # Activity 3: more than one worker configured
    if hasattr(ht_qm, 'max_workers') and ht_qm.max_workers > 1:
        pipeline_activities_completed.append("parallel_optimization")
        assessment.record_activity(
            "parallel_processing_optimization",
            {"status": "configured", "max_workers": ht_qm.max_workers},
            {"processing_type": "ThreadPoolExecutor", "optimization": "enabled"}
        )
        print(f"✅ Parallel processing configured - {ht_qm.max_workers} workers")

    # Section score: concepts and activities weigh equally
    pipeline_activity_score = len(pipeline_activities_completed) / 3  # 3 expected activities
    section_3_score = (pipeline_mastery_score + pipeline_activity_score) / 2

    assessment.record_activity(
        "section_3_qm_pipeline",
        {
            "status": "completed",
            "mastery_score": pipeline_mastery_score,
            "activity_score": pipeline_activity_score,
            "overall_score": section_3_score
        },
        {
            "concepts_mastered": sum(pipeline_concepts.values()),
            "total_concepts": len(pipeline_concepts),
            "activities_completed": pipeline_activities_completed,
            "pipeline_framework": "production_ready"
        }
    )

    assessment.end_section("section_3_qm_pipeline")

    print(f"\n📊 Section 3 Assessment Results:")
    print(f"   🧠 Pipeline Theory Mastery: {pipeline_mastery_score:.1%}")
    print(f"   🔧 Implementation Completion: {pipeline_activity_score:.1%}")
    print(f"   📈 Overall Score: {section_3_score:.1%}")

    if section_3_score >= 0.8:
        print("🌟 Excellent! Production-ready QM pipeline mastered")
    elif section_3_score >= 0.6:
        print("✅ Good progress! Ready for advanced quantum-ML integration")
    else:
        print("📚 Consider reviewing pipeline development fundamentals")

print("\n" + "="*65)
# 🎯 Final assessment: Day 4 quantum chemistry project.
# Combines the three section scores into overall, advanced-mastery,
# practical-implementation and Day 5 readiness figures, records them and
# saves a report.

print("🎯 FINAL ASSESSMENT: Day 4 Quantum Chemistry Mastery Evaluation")
print("="*75)

if not assessment:
    print("⚠️ Assessment framework not available")
    print("📋 Manual evaluation recommended based on:")
    print("   • Quantum chemistry engine implementation")
    print("   • Electronic structure ML model development")
    print("   • QM pipeline and workflow automation")
    print("   • Code quality and documentation")
else:
    assessment.start_section("day_4_final_assessment")

    # Comprehensive Day 4 evaluation
    print("📊 Comprehensive Day 4 Assessment")
    print("-" * 40)

    # Section scores as reported by the framework; missing sections count as 0
    progress = assessment.get_progress_summary()
    section_scores = progress.get('section_scores', {})
    day_4_sections = {
        section: section_scores.get(section, 0)
        for section in (
            "section_1_fundamentals",
            "section_2_electronic_ml",
            "section_3_qm_pipeline",
        )
    }

    # Display section performance
    print("\n📈 Section Performance Summary:")
    section_names = {
        "section_1_fundamentals": "🧮 Quantum Chemistry Fundamentals",
        "section_2_electronic_ml": "🧠 Electronic Structure ML",
        "section_3_qm_pipeline": "⚙️ QM Data Pipeline"
    }

    total_score = 0
    completed_sections = 0

    for section_key, section_name in section_names.items():
        score = day_4_sections[section_key]
        # A section only counts as completed with a positive score
        if not score > 0:
            print(f"   {section_name}: Not completed")
            continue
        print(f"   {section_name}: {score:.1%}")
        total_score += score
        completed_sections += 1

    # Overall score: mean over completed sections (0 when none completed)
    day_4_overall_score = total_score / max(completed_sections, 1)

    # Advanced mastery: share of the stricter thresholds that are met
    advanced_skills = {
        "quantum_theory_mastery": day_4_sections["section_1_fundamentals"] >= 0.8,
        "ml_integration_expert": day_4_sections["section_2_electronic_ml"] >= 0.8,
        "pipeline_architect": day_4_sections["section_3_qm_pipeline"] >= 0.8,
        "production_ready": day_4_overall_score >= 0.7
    }
    advanced_mastery_score = sum(advanced_skills.values()) / len(advanced_skills)

    # Practical implementation: one skill per component that exists and was used
    practical_skills = []

    # QM engine
    if hasattr(qm_engine, 'batch_calculate') and 'batch_results' in locals():
        practical_skills.append("quantum_calculations")

    # ML framework
    if hasattr(esml, 'train_models') and 'train_results' in locals():
        practical_skills.append("ml_modeling")

    # Pipeline
    if hasattr(orchestrator, 'execute_workflow') and 'completed_workflow' in locals():
        practical_skills.append("workflow_automation")

    practical_implementation_score = len(practical_skills) / 3  # 3 core skills

    # Day 5 readiness: share of the baseline thresholds that are met
    day_5_readiness_criteria = {
        "quantum_fundamentals": day_4_sections["section_1_fundamentals"] >= 0.6,
        "ml_experience": day_4_sections["section_2_electronic_ml"] >= 0.6,
        "automation_skills": day_4_sections["section_3_qm_pipeline"] >= 0.6,
        "overall_competency": day_4_overall_score >= 0.6
    }
    day_5_readiness = (
        sum(day_5_readiness_criteria.values()) / len(day_5_readiness_criteria)
    )

    # Record final assessment
    assessment.record_activity(
        "day_4_final_comprehensive_assessment",
        {
            "status": "completed",
            "overall_score": day_4_overall_score,
            "advanced_mastery": advanced_mastery_score,
            "practical_implementation": practical_implementation_score,
            "day_5_readiness": day_5_readiness,
            "completed_sections": completed_sections,
            "total_sections": 3
        },
        {
            "section_breakdown": day_4_sections,
            "advanced_skills": advanced_skills,
            "practical_skills": practical_skills,
            "specialization_track": track_selected,
            "readiness_criteria": day_5_readiness_criteria
        }
    )

    assessment.end_section("day_4_final_assessment")
    assessment.end_section("day_4_quantum_chemistry")

    # Display comprehensive results
    print(f"\n🎯 Day 4 Final Assessment Results:")
    print(f"   📊 Overall Score: {day_4_overall_score:.1%}")
    print(f"   🌟 Advanced Mastery: {advanced_mastery_score:.1%}")
    print(f"   🔧 Practical Implementation: {practical_implementation_score:.1%}")
    print(f"   📈 Day 5 Readiness: {day_5_readiness:.1%}")

    print(f"\n📋 Completed Sections: {completed_sections}/3")
    print(f"🛠️ Practical Skills Mastered: {len(practical_skills)}/3")

    # Performance evaluation: the first band whose threshold is reached wins
    print(f"\n🎓 Performance Evaluation:")
    performance_bands = (
        (0.9, ("🌟 OUTSTANDING! Exceptional quantum chemistry mastery",
               "🚀 Ready for advanced quantum ML and quantum computing applications")),
        (0.8, ("🌟 EXCELLENT! Strong quantum chemistry foundation",
               "✅ Well-prepared for Day 5: Quantum ML specialization")),
        (0.7, ("✅ GOOD! Solid understanding of quantum chemistry concepts",
               "📚 Minor review recommended before Day 5")),
        (0.6, ("📚 DEVELOPING! Basic competency achieved",
               "⚠️ Additional practice recommended for quantum concepts")),
    )
    lowest_band = (
        "📚 NEEDS IMPROVEMENT! Foundation concepts require reinforcement",
        "🔄 Consider reviewing Day 4 materials before proceeding",
    )
    verdict_lines = next(
        (lines for threshold, lines in performance_bands
         if day_4_overall_score >= threshold),
        lowest_band,
    )
    for verdict_line in verdict_lines:
        print(verdict_line)

    # Day 5 readiness verdict
    print(f"\n🔮 Day 5 Readiness Assessment:")
    if day_5_readiness >= 0.8:
        print("🚀 FULLY READY for Day 5: Quantum ML & Advanced Applications")
        print("🎯 Recommended track: Advanced Quantum Machine Learning")
    elif day_5_readiness >= 0.6:
        print("✅ READY for Day 5 with standard preparation")
        print("📖 Brief review of quantum fundamentals recommended")
    else:
        print("⚠️ ADDITIONAL PREPARATION needed for Day 5")
        print("📚 Focus on strengthening quantum chemistry foundations")

    # Generate and save final report
    final_report = assessment.get_comprehensive_report()

    # Saving is best effort: a failure is reported but does not stop the cell
    try:
        report_filename = f"day_4_assessment_report_{student_id}.json"
        assessment.save_final_report(report_filename)
        print(f"\n📄 Final assessment report saved: {report_filename}")
    except Exception as e:
        print(f"⚠️ Could not save report: {e}")

    print(f"\n🎉 Day 4 Quantum Chemistry Project Assessment Complete!")
    print("=" * 75)

# Closing banner: recap of Day 4 and a preview of Day 5
for banner_line in (
    "\n🎉 CONGRATULATIONS! Day 4 Quantum Chemistry Project Complete!",
    "="*70,
    "🎯 Key Achievements:",
    "   ⚛️ Mastered quantum chemistry fundamentals",
    "   🧠 Integrated ML with electronic structure theory",
    "   ⚙️ Built production-ready QM calculation pipelines",
    "   🚀 Developed automated workflow orchestration",
    "\n🔮 Coming Next: Day 5 - Quantum ML & Advanced Applications",
    "   • Quantum neural networks",
    "   • Variational quantum algorithms",
    "   • Quantum-enhanced drug discovery",
    "   • Production quantum ML systems",
    "="*70,
):
    print(banner_line)