In [None]:
# =============================================================================
# GNINA Docking Notebook - Enhanced for Google Colab with GPU Support
# =============================================================================
# This notebook provides a comprehensive molecular docking workflow using GNINA
# with optimized settings for Google Colab environment and GPU acceleration

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import essential libraries
import os
import sys
import subprocess
import pathlib
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Environment setup complete")
print(f"Python version: {sys.version}")
print(f"Working directory: {os.getcwd()}")

Mounted at /content/drive


In [None]:
# =============================================================================
# Configuration and Setup
# =============================================================================

# Set working directory
WORK_DIR = "/content/drive/MyDrive/Sertaline_Derv_docking"
%cd {WORK_DIR}

# Configuration parameters
CONFIG = {
    'project_name': 'Sertaline_Derv_docking',
    'gnina_version': 'v1.3.2',  # Latest version
    'exhaustiveness': 32,        # Higher for better sampling
    'num_modes': 20,            # Number of poses to generate
    'seed': 42,                 # For reproducibility
    'cnn_scoring': 'rescore',   # Use CNN for final scoring
    'cpu_cores': 4,             # Number of CPU cores to use
    'batch_size': 5,            # Process in batches
    'timeout': 300,             # Timeout per docking (seconds)
}

# Create directory structure
dirs_to_create = [
    'ligands_raw', 'ligands_prep', 'receptors_raw', 'receptors_prep', 
    'gnina_out', 'results', 'logs', 'visualizations'
]

for dir_name in dirs_to_create:
    os.makedirs(dir_name, exist_ok=True)

print(f"‚úÖ Working in: {WORK_DIR}")
print(f"‚úÖ Configuration: {CONFIG}")
print(f"‚úÖ Directories created: {dirs_to_create}")

/content/drive/MyDrive/Sertaline_Derv_docking


In [None]:
# =============================================================================
# Install Dependencies and Download GNINA
# =============================================================================

# Install required packages
print("üì¶ Installing dependencies...")
%pip install -q rdkit-pypi meeko pdb2pqr openbabel
!apt-get update -qq && apt-get install -y -qq openbabel pdb2pqr

# Download GNINA binary (GPU-enabled version)
print("‚¨áÔ∏è Downloading GNINA binary...")
gnina_url = f"https://github.com/gnina/gnina/releases/download/{CONFIG['gnina_version']}/gnina"
!wget -q {gnina_url} -O gnina

# Make executable
!chmod +x gnina

# Verify installation
print("üîç Verifying GNINA installation...")
!./gnina --version

# Check GPU availability
print("\nüñ•Ô∏è Checking GPU availability...")
try:
    gpu_info = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    if gpu_info.returncode == 0:
        print("‚úÖ GPU detected:")
        print(gpu_info.stdout.split('\n')[0:3])  # Show first few lines
        CONFIG['use_gpu'] = True
    else:
        print("‚ö†Ô∏è No GPU detected, using CPU")
        CONFIG['use_gpu'] = False
except:
    print("‚ö†Ô∏è GPU check failed, using CPU")
    CONFIG['use_gpu'] = False

print(f"‚úÖ GNINA setup complete. GPU mode: {CONFIG['use_gpu']}")

--2025-05-19 15:11:01--  https://github.com/gnina/gnina/releases/download/v1.3/gnina
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/45548146/f37c6a31-c8d1-4c4f-9748-c7c6f727e868?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250519%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250519T150856Z&X-Amz-Expires=300&X-Amz-Signature=d5cb814fe380223e9e280f1522c9b195f4ecb006337d29af4fd90f4a96000480&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dgnina&response-content-type=application%2Foctet-stream [following]
--2025-05-19 15:11:01--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/45548146/f37c6a31-c8d1-4c4f-9748-c7c6f727e868?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%

In [None]:
# =============================================================================
# File Preparation and Validation
# =============================================================================

def validate_project_structure():
    """Validate that all required files and directories exist"""
    required_files = ['pairlist.csv']
    required_dirs = ['ligands_raw', 'receptors_raw']
    
    missing_files = []
    missing_dirs = []
    
    for file in required_files:
        if not os.path.exists(file):
            missing_files.append(file)
    
    for dir_name in required_dirs:
        if not os.path.exists(dir_name):
            missing_dirs.append(dir_name)
    
    if missing_files or missing_dirs:
        print("‚ùå Missing required files/directories:")
        for item in missing_files + missing_dirs:
            print(f"   - {item}")
        return False
    
    print("‚úÖ Project structure validated")
    return True

def load_and_validate_pairlist():
    """Load and validate the pairlist.csv file"""
    try:
        # Load pairlist with flexible column handling
        df = pd.read_csv('pairlist.csv', skipinitialspace=True)
        
        # Normalize column names
        df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
        
        # Required columns
        required_cols = ['receptor', 'ligand', 'center_x', 'center_y', 'center_z', 
                        'size_x', 'size_y', 'size_z']
        
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"‚ùå Missing columns in pairlist.csv: {missing_cols}")
            return None
        
        # Validate numeric columns
        numeric_cols = ['center_x', 'center_y', 'center_z', 'size_x', 'size_y', 'size_z']
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if df[col].isna().any():
                print(f"‚ùå Non-numeric values found in column: {col}")
                return None
        
        # Add site_id if missing
        if 'site_id' not in df.columns:
            df['site_id'] = 'site1'
        
        print(f"‚úÖ Pairlist loaded successfully: {len(df)} entries")
        print(f"   Receptors: {df['receptor'].nunique()}")
        print(f"   Ligands: {df['ligand'].nunique()}")
        
        return df
        
    except Exception as e:
        print(f"‚ùå Error loading pairlist.csv: {e}")
        return None

# Validate project structure
if not validate_project_structure():
    print("Please ensure all required files and directories are present")
else:
    # Load pairlist
    pairlist_df = load_and_validate_pairlist()
    
    if pairlist_df is not None:
        print("\nüìä Pairlist preview:")
        print(pairlist_df.head())

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openbabel is already the newest version (3.1.1+dfsg-6ubuntu5).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [None]:
# =============================================================================
# Advanced GNINA Configuration and Helper Functions
# =============================================================================

class GNINADocker:
    """Enhanced GNINA docking class with advanced features"""
    
    def __init__(self, config):
        self.config = config
        self.gnina_bin = "./gnina"
        self.results = []
        self.failures = []
        
    def build_gnina_command(self, row):
        """Build optimized GNINA command with advanced parameters"""
        receptor = f"receptors_prep/{row['receptor']}.pdbqt"
        ligand = f"ligands_prep/{row['ligand']}.pdbqt"
        
        # Output files
        tag = f"{row['receptor']}_{row['site_id']}_{row['ligand']}"
        output_sdf = f"gnina_out/{tag}_poses.sdf"
        log_file = f"logs/{tag}.log"
        
        # Base command
        cmd = [
            self.gnina_bin,
            "--receptor", receptor,
            "--ligand", ligand,
            "--out", output_sdf,
            "--log", log_file,
        ]
        
        # Docking box parameters
        cmd.extend([
            "--center_x", str(row['center_x']),
            "--center_y", str(row['center_y']),
            "--center_z", str(row['center_z']),
            "--size_x", str(row['size_x']),
            "--size_y", str(row['size_y']),
            "--size_z", str(row['size_z']),
        ])
        
        # Performance parameters
        cmd.extend([
            "--exhaustiveness", str(self.config['exhaustiveness']),
            "--num_modes", str(self.config['num_modes']),
            "--seed", str(self.config['seed']),
            "--cpu", str(self.config['cpu_cores']),
        ])
        
        # Advanced GNINA features
        cmd.extend([
            "--cnn_scoring", self.config['cnn_scoring'],
            "--cnn_rotation", "0",  # No rotation for speed
            "--min_rmsd_filter", "1.0",
            "--pose_sort_order", "0",  # Sort by CNN score
        ])
        
        # GPU support
        if self.config['use_gpu']:
            cmd.append("--gpu")
            cmd.extend(["--device", "0"])
        
        return cmd, output_sdf, log_file
    
    def run_docking(self, row):
        """Run docking for a single receptor-ligand pair"""
        try:
            cmd, output_sdf, log_file = self.build_gnina_command(row)
            
            # Run command with timeout
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.config['timeout']
            )
            
            if result.returncode == 0:
                # Parse results
                scores = self.parse_gnina_log(log_file)
                return {
                    'status': 'success',
                    'output_file': output_sdf,
                    'log_file': log_file,
                    'scores': scores,
                    'row': row
                }
            else:
                return {
                    'status': 'failed',
                    'error': result.stderr,
                    'row': row
                }
                
        except subprocess.TimeoutExpired:
            return {
                'status': 'timeout',
                'error': f"Docking timed out after {self.config['timeout']} seconds",
                'row': row
            }
        except Exception as e:
            return {
                'status': 'error',
                'error': str(e),
                'row': row
            }
    
    def parse_gnina_log(self, log_file):
        """Parse GNINA log file to extract scores"""
        scores = []
        try:
            with open(log_file, 'r') as f:
                for line in f:
                    if 'CNNaffinity' in line and 'CNNscore' in line:
                        parts = line.strip().split()
                        if len(parts) >= 4:
                            scores.append({
                                'cnn_affinity': float(parts[1]),
                                'cnn_score': float(parts[3].strip('()'))
                            })
        except Exception as e:
            print(f"Warning: Could not parse log file {log_file}: {e}")
        
        return scores

# Initialize GNINA docker
gnina_docker = GNINADocker(CONFIG)
print("‚úÖ GNINA Docker initialized with advanced configuration")

In [None]:
# =============================================================================
# Execute Docking Workflow
# =============================================================================

def run_batch_docking(pairlist_df, batch_size=None):
    """Run docking for all pairs with progress tracking and error handling"""
    
    if batch_size is None:
        batch_size = CONFIG['batch_size']
    
    total_pairs = len(pairlist_df)
    successful = 0
    failed = 0
    
    print(f"üöÄ Starting batch docking: {total_pairs} pairs")
    print(f"   Batch size: {batch_size}")
    print(f"   GPU mode: {CONFIG['use_gpu']}")
    print(f"   Exhaustiveness: {CONFIG['exhaustiveness']}")
    
    # Process in batches
    for batch_start in tqdm(range(0, total_pairs, batch_size), desc="Processing batches"):
        batch_end = min(batch_start + batch_size, total_pairs)
        batch_df = pairlist_df.iloc[batch_start:batch_end]
        
        print(f"\nüì¶ Processing batch {batch_start//batch_size + 1}: pairs {batch_start+1}-{batch_end}")
        
        batch_results = []
        for idx, row in batch_df.iterrows():
            result = gnina_docker.run_docking(row)
            batch_results.append(result)
            
            if result['status'] == 'success':
                successful += 1
                print(f"   ‚úÖ {row['receptor']}-{row['ligand']}")
            else:
                failed += 1
                print(f"   ‚ùå {row['receptor']}-{row['ligand']}: {result['status']}")
        
        # Save batch results
        batch_results_df = pd.DataFrame(batch_results)
        batch_results_df.to_csv(f"results/batch_{batch_start//batch_size + 1}_results.csv", index=False)
    
    # Summary
    print(f"\nüìä Docking Summary:")
    print(f"   Total pairs: {total_pairs}")
    print(f"   Successful: {successful}")
    print(f"   Failed: {failed}")
    print(f"   Success rate: {successful/total_pairs*100:.1f}%")
    
    return successful, failed

# Check if pairlist is loaded
if 'pairlist_df' in locals() and pairlist_df is not None:
    # Run docking
    successful, failed = run_batch_docking(pairlist_df)
    
    # Save overall results
    results_summary = {
        'total_pairs': len(pairlist_df),
        'successful': successful,
        'failed': failed,
        'success_rate': successful/len(pairlist_df)*100,
        'config': CONFIG
    }
    
    with open('results/docking_summary.json', 'w') as f:
        import json
        json.dump(results_summary, f, indent=2)
    
    print("‚úÖ Docking workflow completed!")
else:
    print("‚ùå Please ensure pairlist.csv is loaded before running docking")

gnina v1.3 master:97fa6bc+   Built Oct  3 2024.


In [None]:
# =============================================================================
# Tiered CNN Scoring Workflow Implementation
# =============================================================================

class TieredGNINAWorkflow:
    """
    Implements a tiered CNN scoring approach for efficient molecular docking:
    Stage A: Broad screen with rescore (fast)
    Stage B: Focused re-dock with refinement (balanced) 
    Stage C: Finalists with all CNN (expensive, optional)
    """
    
    def __init__(self, config):
        self.config = config
        self.gnina_bin = "./gnina"
        self.stage_results = {}
        
        # Stage configurations
        self.stage_configs = {
            'A': {  # Broad screen
                'cnn_scoring': 'rescore',
                'exhaustiveness': 12,
                'num_modes': 8,
                'description': 'Fast broad screening',
                'cnn_score_threshold': 0.5,  # Relaxed threshold
                'max_ligands_per_receptor': None  # No limit
            },
            'B': {  # Focused re-dock
                'cnn_scoring': 'refinement', 
                'exhaustiveness': 24,
                'num_modes': 15,
                'description': 'Balanced refinement',
                'cnn_score_threshold': 0.7,  # Stricter threshold
                'max_ligands_per_receptor': 5,  # Top 5 per receptor
                'top_percentage': 0.05  # Top 5% from Stage A
            },
            'C': {  # Finalists (optional)
                'cnn_scoring': 'all',
                'exhaustiveness': 48,
                'num_modes': 20,
                'description': 'High-accuracy final screening',
                'cnn_score_threshold': 0.8,  # Very strict
                'max_ligands_per_receptor': 2,  # Top 2 per receptor
                'top_percentage': 0.01  # Top 1% from Stage B
            }
        }
    
    def run_stage(self, stage, pairlist_df, previous_results=None):
        """Run a specific stage of the tiered workflow"""
        stage_config = self.stage_configs[stage]
        
        print(f"\nüöÄ Starting Stage {stage}: {stage_config['description']}")
        print(f"   CNN Scoring: {stage_config['cnn_scoring']}")
        print(f"   Exhaustiveness: {stage_config['exhaustiveness']}")
        print(f"   Num Modes: {stage_config['num_modes']}")
        
        # Filter input based on previous stage results
        if stage == 'A':
            input_df = pairlist_df.copy()
        else:
            input_df = self._filter_for_stage(stage, previous_results, pairlist_df)
        
        if len(input_df) == 0:
            print(f"   ‚ö†Ô∏è No ligands to process for Stage {stage}")
            return []
        
        print(f"   Processing {len(input_df)} ligand-receptor pairs")
        
        # Update config for this stage
        stage_config_copy = self.config.copy()
        stage_config_copy.update({
            'cnn_scoring': stage_config['cnn_scoring'],
            'exhaustiveness': stage_config['exhaustiveness'],
            'num_modes': stage_config['num_modes']
        })
        
        # Run docking
        gnina_docker = GNINADocker(stage_config_copy)
        results = []
        
        for idx, row in tqdm(input_df.iterrows(), total=len(input_df), desc=f"Stage {stage}"):
            result = gnina_docker.run_docking(row)
            result['stage'] = stage
            result['stage_config'] = stage_config
            results.append(result)
        
        # Save stage results
        self.stage_results[stage] = results
        self._save_stage_results(stage, results)
        
        # Print stage summary
        self._print_stage_summary(stage, results)
        
        return results
    
    def _filter_for_stage(self, stage, previous_results, pairlist_df):
        """Filter ligands for subsequent stages based on previous results"""
        if not previous_results:
            return pairlist_df
        
        # Extract successful results with scores
        successful_results = [r for r in previous_results if r['status'] == 'success']
        
        if not successful_results:
            return pd.DataFrame()
        
        # Create results DataFrame
        results_df = pd.DataFrame([
            {
                'receptor': r['row']['receptor'],
                'ligand': r['row']['ligand'],
                'site_id': r['row']['site_id'],
                'cnn_score': max([s.get('cnn_score', 0) for s in r.get('scores', [])], default=0),
                'cnn_affinity': max([s.get('cnn_affinity', 0) for s in r.get('scores', [])], default=0)
            }
            for r in successful_results
        ])
        
        stage_config = self.stage_configs[stage]
        
        # Apply filters
        filtered_df = results_df[
            results_df['cnn_score'] >= stage_config['cnn_score_threshold']
        ].copy()
        
        # Apply top percentage filter
        if 'top_percentage' in stage_config:
            top_count = max(1, int(len(filtered_df) * stage_config['top_percentage']))
            filtered_df = filtered_df.nlargest(top_count, 'cnn_score')
        
        # Apply per-receptor limit
        if stage_config['max_ligands_per_receptor']:
            filtered_df = (filtered_df.groupby('receptor')
                          .apply(lambda x: x.nlargest(stage_config['max_ligands_per_receptor'], 'cnn_score'))
                          .reset_index(drop=True))
        
        # Merge back with original pairlist to get full row data
        if len(filtered_df) > 0:
            merged_df = pairlist_df.merge(
                filtered_df[['receptor', 'ligand', 'site_id']], 
                on=['receptor', 'ligand', 'site_id']
            )
            return merged_df
        
        return pd.DataFrame()
    
    def _save_stage_results(self, stage, results):
        """Save stage results to files"""
        stage_dir = f"results/stage_{stage}"
        os.makedirs(stage_dir, exist_ok=True)
        
        # Save detailed results
        results_df = pd.DataFrame(results)
        results_df.to_csv(f"{stage_dir}/stage_{stage}_results.csv", index=False)
        
        # Save summary
        successful = [r for r in results if r['status'] == 'success']
        if successful:
            summary_data = []
            for result in successful:
                scores = result.get('scores', [])
                if scores:
                    best_score = max(scores, key=lambda x: x.get('cnn_score', 0))
                    summary_data.append({
                        'receptor': result['row']['receptor'],
                        'ligand': result['row']['ligand'],
                        'site_id': result['row']['site_id'],
                        'cnn_score': best_score.get('cnn_score', 0),
                        'cnn_affinity': best_score.get('cnn_affinity', 0),
                        'output_file': result.get('output_file', ''),
                        'log_file': result.get('log_file', '')
                    })
            
            if summary_data:
                summary_df = pd.DataFrame(summary_data)
                summary_df.to_csv(f"{stage_dir}/stage_{stage}_summary.csv", index=False)
    
    def _print_stage_summary(self, stage, results):
        """Print summary for the stage"""
        total = len(results)
        successful = len([r for r in results if r['status'] == 'success'])
        failed = total - successful
        
        print(f"\nüìä Stage {stage} Summary:")
        print(f"   Total pairs: {total}")
        print(f"   Successful: {successful}")
        print(f"   Failed: {failed}")
        print(f"   Success rate: {successful/total*100:.1f}%")
        
        if successful > 0:
            successful_results = [r for r in results if r['status'] == 'success']
            all_scores = []
            for result in successful_results:
                scores = result.get('scores', [])
                if scores:
                    all_scores.extend([s.get('cnn_score', 0) for s in scores])
            
            if all_scores:
                print(f"   CNN Score range: {min(all_scores):.3f} - {max(all_scores):.3f}")
                print(f"   CNN Score mean: {np.mean(all_scores):.3f}")
    
    def run_complete_workflow(self, pairlist_df, stages=['A', 'B']):
        """Run the complete tiered workflow"""
        print("üéØ Starting Tiered GNINA Workflow")
        print(f"   Stages to run: {stages}")
        
        all_results = []
        previous_results = None
        
        for stage in stages:
            stage_results = self.run_stage(stage, pairlist_df, previous_results)
            all_results.extend(stage_results)
            previous_results = stage_results
            
            # Check if we should continue
            if stage in ['A', 'B'] and len(stage_results) == 0:
                print(f"   ‚ö†Ô∏è No results from Stage {stage}, stopping workflow")
                break
        
        # Save complete workflow results
        self._save_workflow_summary(all_results)
        
        return all_results
    
    def _save_workflow_summary(self, all_results):
        """Save complete workflow summary"""
        workflow_summary = {
            'total_pairs_processed': len(all_results),
            'stages_completed': list(self.stage_results.keys()),
            'config': self.config,
            'stage_configs': self.stage_configs
        }
        
        with open('results/workflow_summary.json', 'w') as f:
            import json
            json.dump(workflow_summary, f, indent=2)
        
        print("\n‚úÖ Workflow summary saved to results/workflow_summary.json")

# Initialize tiered workflow
tiered_workflow = TieredGNINAWorkflow(CONFIG)
print("‚úÖ Tiered GNINA Workflow initialized")


In [None]:
# =============================================================================
# Parallel Processing Implementation
# =============================================================================

import concurrent.futures
import multiprocessing as mp
from functools import partial
import threading
import queue
import time

class ParallelGNINADocker:
    """Enhanced GNINA docker with parallel processing capabilities"""
    
    def __init__(self, config, max_workers=None):
        self.config = config
        self.gnina_bin = "./gnina"
        self.max_workers = max_workers or min(4, mp.cpu_count())
        self.results_queue = queue.Queue()
        self.progress_lock = threading.Lock()
        
    def run_docking_parallel(self, pairlist_df, batch_size=None):
        """Run docking with parallel processing"""
        if batch_size is None:
            batch_size = self.config.get('batch_size', 5)
        
        total_pairs = len(pairlist_df)
        print(f"üöÄ Starting parallel docking: {total_pairs} pairs")
        print(f"   Max workers: {self.max_workers}")
        print(f"   Batch size: {batch_size}")
        
        # Split into batches
        batches = [pairlist_df.iloc[i:i+batch_size] for i in range(0, total_pairs, batch_size)]
        
        all_results = []
        successful = 0
        failed = 0
        
        # Process batches in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all batches
            future_to_batch = {
                executor.submit(self._process_batch, batch, batch_idx): batch_idx 
                for batch_idx, batch in enumerate(batches)
            }
            
            # Collect results as they complete
            for future in tqdm(concurrent.futures.as_completed(future_to_batch), 
                             total=len(batches), desc="Processing batches"):
                batch_idx = future_to_batch[future]
                try:
                    batch_results = future.result()
                    all_results.extend(batch_results)
                    
                    # Count successes/failures
                    for result in batch_results:
                        if result['status'] == 'success':
                            successful += 1
                        else:
                            failed += 1
                            
                except Exception as e:
                    print(f"‚ùå Batch {batch_idx} failed: {e}")
                    failed += len(batches[batch_idx])
        
        # Summary
        print(f"\nüìä Parallel Docking Summary:")
        print(f"   Total pairs: {total_pairs}")
        print(f"   Successful: {successful}")
        print(f"   Failed: {failed}")
        print(f"   Success rate: {successful/total_pairs*100:.1f}%")
        
        return all_results
    
    def _process_batch(self, batch_df, batch_idx):
        """Process a single batch of docking pairs"""
        batch_results = []
        gnina_docker = GNINADocker(self.config)
        
        for idx, row in batch_df.iterrows():
            try:
                result = gnina_docker.run_docking(row)
                result['batch_idx'] = batch_idx
                batch_results.append(result)
                
                # Update progress
                with self.progress_lock:
                    status = "‚úÖ" if result['status'] == 'success' else "‚ùå"
                    print(f"   {status} Batch {batch_idx}: {row['receptor']}-{row['ligand']}")
                    
            except Exception as e:
                error_result = {
                    'status': 'error',
                    'error': str(e),
                    'row': row,
                    'batch_idx': batch_idx
                }
                batch_results.append(error_result)
        
        return batch_results

class ParallelTieredWorkflow(TieredGNINAWorkflow):
    """Tiered workflow with parallel processing"""
    
    def __init__(self, config, max_workers=None):
        super().__init__(config)
        self.max_workers = max_workers or min(4, mp.cpu_count())
        self.parallel_docker = ParallelGNINADocker(config, max_workers)
    
    def run_stage_parallel(self, stage, pairlist_df, previous_results=None):
        """Run a specific stage with parallel processing"""
        stage_config = self.stage_configs[stage]
        
        print(f"\nüöÄ Starting Stage {stage} (Parallel): {stage_config['description']}")
        print(f"   CNN Scoring: {stage_config['cnn_scoring']}")
        print(f"   Exhaustiveness: {stage_config['exhaustiveness']}")
        print(f"   Num Modes: {stage_config['num_modes']}")
        print(f"   Max Workers: {self.max_workers}")
        
        # Filter input based on previous stage results
        if stage == 'A':
            input_df = pairlist_df.copy()
        else:
            input_df = self._filter_for_stage(stage, previous_results, pairlist_df)
        
        if len(input_df) == 0:
            print(f"   ‚ö†Ô∏è No ligands to process for Stage {stage}")
            return []
        
        print(f"   Processing {len(input_df)} ligand-receptor pairs")
        
        # Update config for this stage
        stage_config_copy = self.config.copy()
        stage_config_copy.update({
            'cnn_scoring': stage_config['cnn_scoring'],
            'exhaustiveness': stage_config['exhaustiveness'],
            'num_modes': stage_config['num_modes']
        })
        
        # Update parallel docker config
        self.parallel_docker.config = stage_config_copy
        
        # Run parallel docking
        results = self.parallel_docker.run_docking_parallel(input_df)
        
        # Add stage information
        for result in results:
            result['stage'] = stage
            result['stage_config'] = stage_config
        
        # Save stage results
        self.stage_results[stage] = results
        self._save_stage_results(stage, results)
        
        # Print stage summary
        self._print_stage_summary(stage, results)
        
        return results
    
    def run_complete_workflow_parallel(self, pairlist_df, stages=['A', 'B']):
        """Run the complete tiered workflow with parallel processing"""
        print("üéØ Starting Parallel Tiered GNINA Workflow")
        print(f"   Stages to run: {stages}")
        print(f"   Max workers: {self.max_workers}")
        
        all_results = []
        previous_results = None
        
        for stage in stages:
            stage_results = self.run_stage_parallel(stage, pairlist_df, previous_results)
            all_results.extend(stage_results)
            previous_results = stage_results
            
            # Check if we should continue
            if stage in ['A', 'B'] and len(stage_results) == 0:
                print(f"   ‚ö†Ô∏è No results from Stage {stage}, stopping workflow")
                break
        
        # Save complete workflow results
        self._save_workflow_summary(all_results)
        
        return all_results

# Initialize parallel workflow
parallel_workflow = ParallelTieredWorkflow(CONFIG, max_workers=4)
print("‚úÖ Parallel Tiered GNINA Workflow initialized")


In [None]:
# =============================================================================
# Resume Capability Implementation
# =============================================================================

import json
import glob
from datetime import datetime

class ResumeCapableWorkflow(ParallelTieredWorkflow):
    """Workflow with resume capability for interrupted runs"""
    
    def __init__(self, config, max_workers=None, resume_file="workflow_state.json"):
        super().__init__(config, max_workers)
        self.resume_file = resume_file
        self.state_file = f"results/{resume_file}"
        
    def save_workflow_state(self, stage, completed_pairs, total_pairs, results=None):
        """Save current workflow state for resume capability"""
        state = {
            'timestamp': datetime.now().isoformat(),
            'stage': stage,
            'completed_pairs': completed_pairs,
            'total_pairs': total_pairs,
            'config': self.config,
            'stage_configs': self.stage_configs,
            'completed_stages': list(self.stage_results.keys()),
            'last_checkpoint': f"stage_{stage}_checkpoint"
        }
        
        # Save state
        with open(self.state_file, 'w') as f:
            json.dump(state, f, indent=2)
        
        # Save checkpoint results if provided
        if results:
            checkpoint_file = f"results/{state['last_checkpoint']}.json"
            with open(checkpoint_file, 'w') as f:
                json.dump(results, f, indent=2)
        
        print(f"üíæ Workflow state saved: {completed_pairs}/{total_pairs} pairs completed in stage {stage}")
    
    def load_workflow_state(self):
        """Load previous workflow state"""
        if not os.path.exists(self.state_file):
            return None
        
        try:
            with open(self.state_file, 'r') as f:
                state = json.load(f)
            
            print(f"üìÇ Found previous workflow state:")
            print(f"   Timestamp: {state['timestamp']}")
            print(f"   Stage: {state['stage']}")
            print(f"   Progress: {state['completed_pairs']}/{state['total_pairs']}")
            print(f"   Completed stages: {state['completed_stages']}")
            
            return state
            
        except Exception as e:
            print(f"‚ö†Ô∏è Error loading workflow state: {e}")
            return None
    
    def get_completed_pairs(self):
        """Get list of already completed pairs from existing results"""
        completed_pairs = set()
        
        # Check all stage result files
        for stage_dir in glob.glob("results/stage_*"):
            if os.path.isdir(stage_dir):
                stage = stage_dir.split('_')[-1]
                summary_file = f"{stage_dir}/stage_{stage}_summary.csv"
                
                if os.path.exists(summary_file):
                    try:
                        df = pd.read_csv(summary_file)
                        for _, row in df.iterrows():
                            pair_id = f"{row['receptor']}_{row['site_id']}_{row['ligand']}"
                            completed_pairs.add(pair_id)
                    except Exception as e:
                        print(f"‚ö†Ô∏è Error reading {summary_file}: {e}")
        
        return completed_pairs
    
    def filter_uncompleted_pairs(self, pairlist_df, stage):
        """Filter out already completed pairs"""
        completed_pairs = self.get_completed_pairs()
        
        if not completed_pairs:
            return pairlist_df
        
        # Create pair IDs for current pairlist
        pairlist_df['pair_id'] = (pairlist_df['receptor'] + '_' + 
                                 pairlist_df['site_id'] + '_' + 
                                 pairlist_df['ligand'])
        
        # Filter out completed pairs
        uncompleted_df = pairlist_df[~pairlist_df['pair_id'].isin(completed_pairs)].copy()
        uncompleted_df = uncompleted_df.drop('pair_id', axis=1)
        
        print(f"üìä Resume filtering for stage {stage}:")
        print(f"   Total pairs: {len(pairlist_df)}")
        print(f"   Already completed: {len(completed_pairs)}")
        print(f"   Remaining: {len(uncompleted_df)}")
        
        return uncompleted_df
    
    def run_stage_with_resume(self, stage, pairlist_df, previous_results=None):
        """Run stage with resume capability"""
        # Filter out already completed pairs
        input_df = self.filter_uncompleted_pairs(pairlist_df, stage)
        
        if len(input_df) == 0:
            print(f"   ‚úÖ All pairs for stage {stage} already completed")
            return self._load_stage_results(stage)
        
        # Run the stage
        results = self.run_stage_parallel(stage, input_df, previous_results)
        
        # Save state after completion
        self.save_workflow_state(stage, len(pairlist_df), len(pairlist_df), results)
        
        return results
    
    def _load_stage_results(self, stage):
        """Load existing stage results"""
        stage_dir = f"results/stage_{stage}"
        results_file = f"{stage_dir}/stage_{stage}_results.csv"
        
        if os.path.exists(results_file):
            try:
                df = pd.read_csv(results_file)
                # Convert back to result format (simplified)
                results = []
                for _, row in df.iterrows():
                    result = {
                        'status': 'success',  # Assume success if in results
                        'stage': stage,
                        'row': {
                            'receptor': row.get('receptor', ''),
                            'ligand': row.get('ligand', ''),
                            'site_id': row.get('site_id', '')
                        }
                    }
                    results.append(result)
                
                print(f"   üìÇ Loaded {len(results)} existing results for stage {stage}")
                return results
                
            except Exception as e:
                print(f"‚ö†Ô∏è Error loading stage {stage} results: {e}")
        
        return []
    
    def run_complete_workflow_with_resume(self, pairlist_df, stages=['A', 'B']):
        """Run complete workflow with resume capability"""
        print("üéØ Starting Resume-Capable Parallel Tiered GNINA Workflow")
        
        # Check for existing state
        state = self.load_workflow_state()
        if state:
            response = input("Found previous workflow state. Resume? (y/n): ").lower()
            if response == 'y':
                print("üîÑ Resuming previous workflow...")
                # Load existing stage results
                for stage in state['completed_stages']:
                    self.stage_results[stage] = self._load_stage_results(stage)
            else:
                print("üÜï Starting fresh workflow...")
                # Clean up old state
                if os.path.exists(self.state_file):
                    os.remove(self.state_file)
        
        all_results = []
        previous_results = None
        
        for stage in stages:
            # Check if stage already completed
            if stage in self.stage_results:
                print(f"   ‚úÖ Stage {stage} already completed, skipping...")
                previous_results = self.stage_results[stage]
                all_results.extend(previous_results)
                continue
            
            # Run stage with resume
            stage_results = self.run_stage_with_resume(stage, pairlist_df, previous_results)
            all_results.extend(stage_results)
            previous_results = stage_results
            
            # Check if we should continue
            if stage in ['A', 'B'] and len(stage_results) == 0:
                print(f"   ‚ö†Ô∏è No results from Stage {stage}, stopping workflow")
                break
        
        # Save complete workflow results
        self._save_workflow_summary(all_results)
        
        # Clean up state file after completion
        if os.path.exists(self.state_file):
            os.remove(self.state_file)
            print("üßπ Cleaned up workflow state file")
        
        return all_results

# Initialize resume-capable workflow
resume_workflow = ResumeCapableWorkflow(CONFIG, max_workers=4)
print("‚úÖ Resume-Capable Workflow initialized")


In [None]:
# =============================================================================
# Main Execution - Choose Your Workflow
# =============================================================================

def run_workflow(workflow_type="resume", stages=['A', 'B']):
    """
    Run the selected workflow type
    
    Args:
        workflow_type: "simple", "tiered", "parallel", "resume"
        stages: List of stages to run ['A', 'B', 'C']
    """
    
    if 'pairlist_df' not in locals() or pairlist_df is None:
        print("‚ùå Please ensure pairlist.csv is loaded before running workflow")
        return None
    
    print(f"üéØ Running {workflow_type.upper()} workflow with stages: {stages}")
    
    if workflow_type == "simple":
        # Simple single-stage workflow
        gnina_docker = GNINADocker(CONFIG)
        results = []
        for idx, row in tqdm(pairlist_df.iterrows(), total=len(pairlist_df), desc="Simple docking"):
            result = gnina_docker.run_docking(row)
            results.append(result)
        return results
    
    elif workflow_type == "tiered":
        # Tiered CNN workflow (sequential)
        return tiered_workflow.run_complete_workflow(pairlist_df, stages)
    
    elif workflow_type == "parallel":
        # Parallel tiered workflow
        return parallel_workflow.run_complete_workflow_parallel(pairlist_df, stages)
    
    elif workflow_type == "resume":
        # Resume-capable parallel tiered workflow
        return resume_workflow.run_complete_workflow_with_resume(pairlist_df, stages)
    
    else:
        print(f"‚ùå Unknown workflow type: {workflow_type}")
        return None

# =============================================================================
# Workflow Configuration Options
# =============================================================================

# Option 1: Quick single-stage run (rescore only)
# results = run_workflow("simple", stages=['A'])

# Option 2: Two-stage tiered workflow (recommended for most cases)
# results = run_workflow("tiered", stages=['A', 'B'])

# Option 3: Parallel two-stage workflow (faster for large datasets)
# results = run_workflow("parallel", stages=['A', 'B'])

# Option 4: Resume-capable parallel workflow (best for production)
# results = run_workflow("resume", stages=['A', 'B'])

# Option 5: Full three-stage workflow (for high-accuracy requirements)
# results = run_workflow("resume", stages=['A', 'B', 'C'])

print("üöÄ Workflow execution functions ready!")
print("\nAvailable workflow types:")
print("  1. simple    - Single-stage rescore (fastest)")
print("  2. tiered    - Two-stage CNN funnel (sequential)")
print("  3. parallel  - Two-stage CNN funnel (parallel)")
print("  4. resume    - Resume-capable parallel workflow (recommended)")
print("\nExample usage:")
print("  results = run_workflow('resume', stages=['A', 'B'])")
print("  results = run_workflow('simple', stages=['A'])")
print("  results = run_workflow('parallel', stages=['A', 'B', 'C'])")


In [None]:
# =============================================================================
# Visualization and Analysis Dashboard
# =============================================================================

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo

class GNINAVisualizationDashboard:
    """Comprehensive visualization dashboard for GNINA results"""
    
    def __init__(self, results_dir="results"):
        self.results_dir = results_dir
        self.setup_plotting_style()
    
    def setup_plotting_style(self):
        """Setup consistent plotting style"""
        plt.style.use('default')
        sns.set_palette("husl")
        
        # Set figure size and DPI
        plt.rcParams['figure.figsize'] = (12, 8)
        plt.rcParams['figure.dpi'] = 100
        plt.rcParams['font.size'] = 10
    
    def create_workflow_overview(self, workflow_summary_file="results/workflow_summary.json"):
        """Create overview of the complete workflow"""
        if not os.path.exists(workflow_summary_file):
            print("‚ùå Workflow summary file not found")
            return
        
        with open(workflow_summary_file, 'r') as f:
            summary = json.load(f)
        
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('GNINA Workflow Overview', fontsize=16, fontweight='bold')
        
        # 1. Stages completed
        stages = summary.get('stages_completed', [])
        axes[0, 0].bar(range(len(stages)), [1]*len(stages), color=['#2E8B57', '#4169E1', '#DC143C'][:len(stages)])
        axes[0, 0].set_xticks(range(len(stages)))
        axes[0, 0].set_xticklabels([f'Stage {s}' for s in stages])
        axes[0, 0].set_title('Completed Stages')
        axes[0, 0].set_ylabel('Status')
        
        # 2. Configuration parameters
        config = summary.get('config', {})
        params = ['exhaustiveness', 'num_modes', 'cpu_cores', 'batch_size']
        values = [config.get(p, 0) for p in params]
        axes[0, 1].bar(params, values, color='skyblue')
        axes[0, 1].set_title('Configuration Parameters')
        axes[0, 1].tick_params(axis='x', rotation=45)
        
        # 3. Total pairs processed
        total_pairs = summary.get('total_pairs_processed', 0)
        axes[1, 0].pie([total_pairs], labels=['Total Pairs'], autopct='%1.0f', 
                      colors=['lightcoral'], startangle=90)
        axes[1, 0].set_title('Total Pairs Processed')
        
        # 4. Workflow efficiency
        stage_configs = summary.get('stage_configs', {})
        if stage_configs:
            stages_list = list(stage_configs.keys())
            exhaustiveness = [stage_configs[s].get('exhaustiveness', 0) for s in stages_list]
            axes[1, 1].plot(stages_list, exhaustiveness, marker='o', linewidth=2, markersize=8)
            axes[1, 1].set_title('Exhaustiveness by Stage')
            axes[1, 1].set_xlabel('Stage')
            axes[1, 1].set_ylabel('Exhaustiveness')
            axes[1, 1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(f"{self.results_dir}/workflow_overview.png", dpi=300, bbox_inches='tight')
        plt.show()
    
    def create_stage_analysis(self, stage='A'):
        """Create detailed analysis for a specific stage"""
        stage_dir = f"{self.results_dir}/stage_{stage}"
        summary_file = f"{stage_dir}/stage_{stage}_summary.csv"
        
        if not os.path.exists(summary_file):
            print(f"‚ùå Stage {stage} summary file not found")
            return
        
        df = pd.read_csv(summary_file)
        
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle(f'Stage {stage} Analysis', fontsize=16, fontweight='bold')
        
        # 1. CNN Score distribution
        axes[0, 0].hist(df['cnn_score'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        axes[0, 0].set_title('CNN Score Distribution')
        axes[0, 0].set_xlabel('CNN Score')
        axes[0, 0].set_ylabel('Frequency')
        axes[0, 0].axvline(df['cnn_score'].mean(), color='red', linestyle='--', 
                          label=f'Mean: {df["cnn_score"].mean():.3f}')
        axes[0, 0].legend()
        
        # 2. CNN Affinity distribution
        axes[0, 1].hist(df['cnn_affinity'], bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
        axes[0, 1].set_title('CNN Affinity Distribution')
        axes[0, 1].set_xlabel('CNN Affinity')
        axes[0, 1].set_ylabel('Frequency')
        axes[0, 1].axvline(df['cnn_affinity'].mean(), color='red', linestyle='--',
                          label=f'Mean: {df["cnn_affinity"].mean():.3f}')
        axes[0, 1].legend()
        
        # 3. Score vs Affinity scatter
        axes[0, 2].scatter(df['cnn_score'], df['cnn_affinity'], alpha=0.6, color='purple')
        axes[0, 2].set_title('CNN Score vs Affinity')
        axes[0, 2].set_xlabel('CNN Score')
        axes[0, 2].set_ylabel('CNN Affinity')
        
        # Add correlation coefficient
        corr = df['cnn_score'].corr(df['cnn_affinity'])
        axes[0, 2].text(0.05, 0.95, f'Correlation: {corr:.3f}', 
                       transform=axes[0, 2].transAxes, fontsize=10,
                       bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
        
        # 4. Top performers by receptor
        receptor_counts = df['receptor'].value_counts().head(10)
        axes[1, 0].barh(range(len(receptor_counts)), receptor_counts.values, color='orange')
        axes[1, 0].set_yticks(range(len(receptor_counts)))
        axes[1, 0].set_yticklabels(receptor_counts.index)
        axes[1, 0].set_title('Top 10 Receptors by Hit Count')
        axes[1, 0].set_xlabel('Number of Hits')
        
        # 5. Top performers by ligand
        ligand_counts = df['ligand'].value_counts().head(10)
        axes[1, 1].barh(range(len(ligand_counts)), ligand_counts.values, color='pink')
        axes[1, 1].set_yticks(range(len(ligand_counts)))
        axes[1, 1].set_yticklabels(ligand_counts.index)
        axes[1, 1].set_title('Top 10 Ligands by Hit Count')
        axes[1, 1].set_xlabel('Number of Hits')
        
        # 6. Score statistics
        stats_data = {
            'Metric': ['Mean', 'Median', 'Std', 'Min', 'Max'],
            'CNN Score': [
                df['cnn_score'].mean(),
                df['cnn_score'].median(),
                df['cnn_score'].std(),
                df['cnn_score'].min(),
                df['cnn_score'].max()
            ],
            'CNN Affinity': [
                df['cnn_affinity'].mean(),
                df['cnn_affinity'].median(),
                df['cnn_affinity'].std(),
                df['cnn_affinity'].min(),
                df['cnn_affinity'].max()
            ]
        }
        
        stats_df = pd.DataFrame(stats_data)
        axes[1, 2].axis('tight')
        axes[1, 2].axis('off')
        table = axes[1, 2].table(cellText=stats_df.values, colLabels=stats_df.columns,
                               cellLoc='center', loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1.2, 1.5)
        axes[1, 2].set_title('Score Statistics')
        
        plt.tight_layout()
        plt.savefig(f"{stage_dir}/stage_{stage}_analysis.png", dpi=300, bbox_inches='tight')
        plt.show()
    
    def create_interactive_dashboard(self):
        """Create interactive Plotly dashboard"""
        # Collect data from all stages
        all_data = []
        
        for stage_dir in glob.glob(f"{self.results_dir}/stage_*"):
            if os.path.isdir(stage_dir):
                stage = stage_dir.split('_')[-1]
                summary_file = f"{stage_dir}/stage_{stage}_summary.csv"
                
                if os.path.exists(summary_file):
                    df = pd.read_csv(summary_file)
                    df['stage'] = stage
                    all_data.append(df)
        
        if not all_data:
            print("‚ùå No stage data found for interactive dashboard")
            return
        
        combined_df = pd.concat(all_data, ignore_index=True)
        
        # Create subplots
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('CNN Score by Stage', 'CNN Affinity by Stage', 
                          'Score vs Affinity', 'Top Performers'),
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"secondary_y": False}, {"secondary_y": False}]]
        )
        
        # 1. CNN Score by stage
        for stage in combined_df['stage'].unique():
            stage_data = combined_df[combined_df['stage'] == stage]
            fig.add_trace(
                go.Box(y=stage_data['cnn_score'], name=f'Stage {stage}'),
                row=1, col=1
            )
        
        # 2. CNN Affinity by stage
        for stage in combined_df['stage'].unique():
            stage_data = combined_df[combined_df['stage'] == stage]
            fig.add_trace(
                go.Box(y=stage_data['cnn_affinity'], name=f'Stage {stage}'),
                row=1, col=2
            )
        
        # 3. Score vs Affinity scatter
        fig.add_trace(
            go.Scatter(x=combined_df['cnn_score'], y=combined_df['cnn_affinity'],
                      mode='markers', name='All Results',
                      marker=dict(color=combined_df['stage'], 
                                colorscale='Viridis',
                                showscale=True,
                                colorbar=dict(title="Stage"))),
            row=2, col=1
        )
        
        # 4. Top performers
        top_performers = combined_df.nlargest(20, 'cnn_score')
        fig.add_trace(
            go.Bar(x=top_performers['cnn_score'],
                  y=[f"{row['receptor']}-{row['ligand']}" for _, row in top_performers.iterrows()],
                  orientation='h', name='Top 20'),
            row=2, col=2
        )
        
        # Update layout
        fig.update_layout(
            title_text="GNINA Interactive Dashboard",
            showlegend=False,
            height=800
        )
        
        # Save and show
        fig.write_html(f"{self.results_dir}/interactive_dashboard.html")
        fig.show()
        
        print(f"üìä Interactive dashboard saved to {self.results_dir}/interactive_dashboard.html")
    
    def create_comparison_plot(self, stages=['A', 'B']):
        """Create comparison plot between stages"""
        stage_data = {}
        
        for stage in stages:
            stage_dir = f"{self.results_dir}/stage_{stage}"
            summary_file = f"{stage_dir}/stage_{stage}_summary.csv"
            
            if os.path.exists(summary_file):
                stage_data[stage] = pd.read_csv(summary_file)
        
        if len(stage_data) < 2:
            print("‚ùå Need at least 2 stages for comparison")
            return
        
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))
        fig.suptitle('Stage Comparison Analysis', fontsize=16, fontweight='bold')
        
        # 1. Score comparison
        for stage, df in stage_data.items():
            axes[0].hist(df['cnn_score'], alpha=0.6, label=f'Stage {stage}', bins=20)
        axes[0].set_title('CNN Score Distribution Comparison')
        axes[0].set_xlabel('CNN Score')
        axes[0].set_ylabel('Frequency')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)
        
        # 2. Affinity comparison
        for stage, df in stage_data.items():
            axes[1].hist(df['cnn_affinity'], alpha=0.6, label=f'Stage {stage}', bins=20)
        axes[1].set_title('CNN Affinity Distribution Comparison')
        axes[1].set_xlabel('CNN Affinity')
        axes[1].set_ylabel('Frequency')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(f"{self.results_dir}/stage_comparison.png", dpi=300, bbox_inches='tight')
        plt.show()

# Initialize visualization dashboard
viz_dashboard = GNINAVisualizationDashboard()
print("‚úÖ Visualization Dashboard initialized")


In [None]:
# =============================================================================
# Execute Workflow and Generate Visualizations
# =============================================================================

# Uncomment the workflow you want to run:

# Option 1: Quick single-stage run (recommended for testing)
# results = run_workflow("simple", stages=['A'])

# Option 2: Two-stage tiered workflow (recommended for most cases)
# results = run_workflow("tiered", stages=['A', 'B'])

# Option 3: Parallel two-stage workflow (faster for large datasets)
# results = run_workflow("parallel", stages=['A', 'B'])

# Option 4: Resume-capable parallel workflow (best for production)
# results = run_workflow("resume", stages=['A', 'B'])

# Option 5: Full three-stage workflow (for high-accuracy requirements)
# results = run_workflow("resume", stages=['A', 'B', 'C'])

# =============================================================================
# Generate Visualizations (run after workflow completion)
# =============================================================================

def generate_all_visualizations():
    """Generate all available visualizations"""
    print("üìä Generating comprehensive visualizations...")
    
    # 1. Workflow overview
    print("   Creating workflow overview...")
    viz_dashboard.create_workflow_overview()
    
    # 2. Stage analysis for each completed stage
    for stage_dir in glob.glob("results/stage_*"):
        if os.path.isdir(stage_dir):
            stage = stage_dir.split('_')[-1]
            print(f"   Creating stage {stage} analysis...")
            viz_dashboard.create_stage_analysis(stage)
    
    # 3. Interactive dashboard
    print("   Creating interactive dashboard...")
    viz_dashboard.create_interactive_dashboard()
    
    # 4. Stage comparison (if multiple stages)
    completed_stages = [d.split('_')[-1] for d in glob.glob("results/stage_*") if os.path.isdir(d)]
    if len(completed_stages) > 1:
        print("   Creating stage comparison...")
        viz_dashboard.create_comparison_plot(completed_stages)
    
    print("‚úÖ All visualizations generated!")

# Uncomment to generate visualizations after running workflow:
# generate_all_visualizations()

print("üéØ Ready to execute workflow!")
print("\nTo run a workflow, uncomment one of the options above:")
print("  - Simple: run_workflow('simple', stages=['A'])")
print("  - Tiered: run_workflow('tiered', stages=['A', 'B'])")
print("  - Parallel: run_workflow('parallel', stages=['A', 'B'])")
print("  - Resume: run_workflow('resume', stages=['A', 'B'])")
print("\nTo generate visualizations after completion:")
print("  generate_all_visualizations()")


In [None]:
# =============================================================================
# Phase 2: Flexible Receptor Docking Implementation
# =============================================================================

class FlexibleReceptorDocker(GNINADocker):
    """Enhanced GNINA docker with flexible receptor capabilities"""
    
    def __init__(self, config):
        super().__init__(config)
        self.flexible_residues = {}
        self.flexible_configs = {
            'auto_detect': True,           # Auto-detect flexible residues
            'distance_threshold': 5.0,     # Distance from binding site (√Ö)
            'max_flexible_residues': 20,   # Maximum flexible residues
            'flexible_chains': [],         # Specific chains to make flexible
            'exclude_residues': [],        # Residues to exclude from flexibility
            'flexdist': 3.5,              # GNINA flexdist parameter
            'flexres': None               # Manual flexible residue specification
        }
    
    def detect_flexible_residues(self, receptor_pdb, binding_center, distance_threshold=5.0):
        """Auto-detect flexible residues near binding site"""
        try:
            from Bio.PDB import PDBParser, NeighborSearch
            from Bio.PDB.PDBExceptions import PDBConstructionWarning
            import warnings
            warnings.simplefilter('ignore', PDBConstructionWarning)
            
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure('receptor', receptor_pdb)
            
            # Get all atoms
            atoms = []
            for model in structure:
                for chain in model:
                    for residue in chain:
                        for atom in residue:
                            atoms.append(atom)
            
            # Create neighbor search
            ns = NeighborSearch(atoms)
            
            # Find residues within distance of binding center
            center_atom = None
            min_distance = float('inf')
            
            for atom in atoms:
                dist = atom.coord - np.array(binding_center)
                dist = np.linalg.norm(dist)
                if dist < min_distance:
                    min_distance = dist
                    center_atom = atom
            
            if center_atom is None:
                return []
            
            # Find neighbors within threshold
            neighbors = ns.search(center_atom.coord, distance_threshold, level='R')
            
            flexible_residues = []
            for residue in neighbors:
                chain_id = residue.parent.id
                res_num = residue.id[1]
                res_name = residue.resname
                flexible_residues.append(f"{chain_id}:{res_num}")
            
            return sorted(flexible_residues)
            
        except ImportError:
            print("‚ö†Ô∏è BioPython not available for auto-detection. Using manual specification.")
            return []
        except Exception as e:
            print(f"‚ö†Ô∏è Error in auto-detection: {e}")
            return []
    
    def set_flexible_residues(self, receptor, flexible_residues=None, auto_detect=True, 
                            binding_center=None, distance_threshold=5.0):
        """Set flexible residues for a receptor"""
        
        if flexible_residues is not None:
            # Manual specification
            self.flexible_residues[receptor] = flexible_residues
            print(f"‚úÖ Set manual flexible residues for {receptor}: {flexible_residues}")
            
        elif auto_detect and binding_center is not None:
            # Auto-detection
            receptor_pdb = f"receptors_prep/{receptor}.pdbqt"
            if os.path.exists(receptor_pdb):
                detected = self.detect_flexible_residues(receptor_pdb, binding_center, distance_threshold)
                
                # Apply limits
                if len(detected) > self.flexible_configs['max_flexible_residues']:
                    detected = detected[:self.flexible_configs['max_flexible_residues']]
                    print(f"‚ö†Ô∏è Limited flexible residues to {self.flexible_configs['max_flexible_residues']}")
                
                self.flexible_residues[receptor] = detected
                print(f"‚úÖ Auto-detected {len(detected)} flexible residues for {receptor}: {detected}")
            else:
                print(f"‚ö†Ô∏è Receptor file not found: {receptor_pdb}")
                self.flexible_residues[receptor] = []
        else:
            print(f"‚ö†Ô∏è No flexible residues set for {receptor}")
            self.flexible_residues[receptor] = []
    
    def build_gnina_command(self, row):
        """Build GNINA command with flexible receptor support"""
        receptor = f"receptors_prep/{row['receptor']}.pdbqt"
        ligand = f"ligands_prep/{row['ligand']}.pdbqt"
        
        # Output files
        tag = f"{row['receptor']}_{row['site_id']}_{row['ligand']}"
        output_sdf = f"gnina_out/{tag}_poses.sdf"
        log_file = f"logs/{tag}.log"
        flex_output = f"gnina_out/{tag}_flex.pdbqt"
        
        # Base command
        cmd = [
            self.gnina_bin,
            "--receptor", receptor,
            "--ligand", ligand,
            "--out", output_sdf,
            "--log", log_file,
        ]
        
        # Docking box parameters
        cmd.extend([
            "--center_x", str(row['center_x']),
            "--center_y", str(row['center_y']),
            "--center_z", str(row['center_z']),
            "--size_x", str(row['size_x']),
            "--size_y", str(row['size_y']),
            "--size_z", str(row['size_z']),
        ])
        
        # Performance parameters
        cmd.extend([
            "--exhaustiveness", str(self.config['exhaustiveness']),
            "--num_modes", str(self.config['num_modes']),
            "--seed", str(self.config['seed']),
            "--cpu", str(self.config['cpu_cores']),
        ])
        
        # Advanced GNINA features
        cmd.extend([
            "--cnn_scoring", self.config['cnn_scoring'],
            "--cnn_rotation", "0",
            "--min_rmsd_filter", "1.0",
            "--pose_sort_order", "0",
        ])
        
        # Flexible receptor parameters
        if row['receptor'] in self.flexible_residues:
            flex_residues = self.flexible_residues[row['receptor']]
            if flex_residues:
                flexres_str = ",".join(flex_residues)
                cmd.extend([
                    "--flexres", flexres_str,
                    "--flexdist", str(self.flexible_configs['flexdist']),
                    "--out_flex", flex_output
                ])
                print(f"   üîÑ Using flexible residues: {flexres_str}")
        
        # GPU support
        if self.config['use_gpu']:
            cmd.append("--gpu")
            cmd.extend(["--device", "0"])
        
        return cmd, output_sdf, log_file, flex_output if row['receptor'] in self.flexible_residues else None

class FlexibleTieredWorkflow(ResumeCapableWorkflow):
    """Tiered workflow with flexible receptor support"""
    
    def __init__(self, config, max_workers=None, resume_file="workflow_state.json"):
        super().__init__(config, max_workers, resume_file)
        self.flexible_docker = FlexibleReceptorDocker(config)
    
    def configure_flexible_docking(self, flexible_config):
        """Configure flexible docking parameters"""
        self.flexible_docker.flexible_configs.update(flexible_config)
        print("‚úÖ Flexible docking configuration updated")
    
    def set_receptor_flexibility(self, receptor, flexible_residues=None, 
                               auto_detect=True, binding_center=None):
        """Set flexible residues for a specific receptor"""
        self.flexible_docker.set_flexible_residues(
            receptor, flexible_residues, auto_detect, binding_center
        )
    
    def set_bulk_flexibility(self, pairlist_df, auto_detect=True):
        """Set flexible residues for all receptors in pairlist"""
        print("üîÑ Configuring flexible residues for all receptors...")
        
        for _, row in pairlist_df.iterrows():
            receptor = row['receptor']
            if receptor not in self.flexible_docker.flexible_residues:
                binding_center = [row['center_x'], row['center_y'], row['center_z']]
                self.flexible_docker.set_flexible_residues(
                    receptor, auto_detect=auto_detect, binding_center=binding_center
                )
    
    def run_stage_parallel(self, stage, pairlist_df, previous_results=None):
        """Run stage with flexible receptor support"""
        stage_config = self.stage_configs[stage]
        
        print(f"\nüöÄ Starting Stage {stage} (Flexible): {stage_config['description']}")
        print(f"   CNN Scoring: {stage_config['cnn_scoring']}")
        print(f"   Exhaustiveness: {stage_config['exhaustiveness']}")
        print(f"   Num Modes: {stage_config['num_modes']}")
        print(f"   Max Workers: {self.max_workers}")
        
        # Filter input based on previous stage results
        if stage == 'A':
            input_df = pairlist_df.copy()
        else:
            input_df = self._filter_for_stage(stage, previous_results, pairlist_df)
        
        if len(input_df) == 0:
            print(f"   ‚ö†Ô∏è No ligands to process for Stage {stage}")
            return []
        
        print(f"   Processing {len(input_df)} ligand-receptor pairs")
        
        # Update config for this stage
        stage_config_copy = self.config.copy()
        stage_config_copy.update({
            'cnn_scoring': stage_config['cnn_scoring'],
            'exhaustiveness': stage_config['exhaustiveness'],
            'num_modes': stage_config['num_modes']
        })
        
        # Update flexible docker config
        self.flexible_docker.config = stage_config_copy
        
        # Run parallel docking with flexible receptors
        results = self._run_flexible_parallel_docking(input_df)
        
        # Add stage information
        for result in results:
            result['stage'] = stage
            result['stage_config'] = stage_config
        
        # Save stage results
        self.stage_results[stage] = results
        self._save_stage_results(stage, results)
        
        # Print stage summary
        self._print_stage_summary(stage, results)
        
        return results
    
    def _run_flexible_parallel_docking(self, pairlist_df, batch_size=None):
        """Run flexible docking with parallel processing"""
        if batch_size is None:
            batch_size = self.config.get('batch_size', 5)
        
        total_pairs = len(pairlist_df)
        print(f"üöÄ Starting flexible parallel docking: {total_pairs} pairs")
        print(f"   Max workers: {self.max_workers}")
        print(f"   Batch size: {batch_size}")
        
        # Split into batches
        batches = [pairlist_df.iloc[i:i+batch_size] for i in range(0, total_pairs, batch_size)]
        
        all_results = []
        successful = 0
        failed = 0
        
        # Process batches in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all batches
            future_to_batch = {
                executor.submit(self._process_flexible_batch, batch, batch_idx): batch_idx 
                for batch_idx, batch in enumerate(batches)
            }
            
            # Collect results as they complete
            for future in tqdm(concurrent.futures.as_completed(future_to_batch), 
                             total=len(batches), desc="Processing flexible batches"):
                batch_idx = future_to_batch[future]
                try:
                    batch_results = future.result()
                    all_results.extend(batch_results)
                    
                    # Count successes/failures
                    for result in batch_results:
                        if result['status'] == 'success':
                            successful += 1
                        else:
                            failed += 1
                            
                except Exception as e:
                    print(f"‚ùå Flexible batch {batch_idx} failed: {e}")
                    failed += len(batches[batch_idx])
        
        # Summary
        print(f"\nüìä Flexible Parallel Docking Summary:")
        print(f"   Total pairs: {total_pairs}")
        print(f"   Successful: {successful}")
        print(f"   Failed: {failed}")
        print(f"   Success rate: {successful/total_pairs*100:.1f}%")
        
        return all_results
    
    def _process_flexible_batch(self, batch_df, batch_idx):
        """Process a single batch with flexible docking"""
        batch_results = []
        
        for idx, row in batch_df.iterrows():
            try:
                result = self.flexible_docker.run_docking(row)
                result['batch_idx'] = batch_idx
                batch_results.append(result)
                
                # Update progress
                with self.progress_lock:
                    status = "‚úÖ" if result['status'] == 'success' else "‚ùå"
                    flex_info = " (Flex)" if row['receptor'] in self.flexible_docker.flexible_residues else ""
                    print(f"   {status} Batch {batch_idx}: {row['receptor']}-{row['ligand']}{flex_info}")
                    
            except Exception as e:
                error_result = {
                    'status': 'error',
                    'error': str(e),
                    'row': row,
                    'batch_idx': batch_idx
                }
                batch_results.append(error_result)
        
        return batch_results

# Initialize flexible workflow
flexible_workflow = FlexibleTieredWorkflow(CONFIG, max_workers=4)
print("‚úÖ Flexible Receptor Workflow initialized")


In [None]:
# =============================================================================
# PDB Preparation Wizard Integration
# =============================================================================

class PDBPreparationIntegration:
    """Integration with PDB preparation wizard for enhanced preprocessing"""
    
    def __init__(self, config):
        self.config = config
        self.pdb_wizard_config = {
            'force_field': 'AMBER',
            'ph': 7.4,
            'plip_enabled': True,
            'clean_structure': True,
            'add_hydrogens': True,
            'optimize_geometry': False,
            'validate_structures': True
        }
    
    def download_pdb_wizard(self):
        """Download and setup PDB preparation wizard"""
        print("üì• Downloading PDB preparation wizard...")
        
        # Clone the repository
        if not os.path.exists("pdb-prepare-wizard"):
            !git clone https://github.com/OASolliman590/pdb-prepare-wizard.git
        else:
            print("‚úÖ PDB preparation wizard already exists")
        
        # Install dependencies
        print("üì¶ Installing PDB wizard dependencies...")
        %pip install -q biopython plip pdb2pqr
        
        print("‚úÖ PDB preparation wizard setup complete")
    
    def prepare_ligands_with_wizard(self, ligands_dir="ligands_raw", output_dir="ligands_prep"):
        """Prepare ligands using the PDB wizard"""
        print("üß™ Preparing ligands with PDB wizard...")
        
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        
        # Use the wizard's ligand preparation
        wizard_script = """
import sys
sys.path.append('pdb-prepare-wizard')
from autodock_preparation import AutoDockPreparationPipeline, PreparationConfig

# Create configuration
config = PreparationConfig(
    ligands_input='{ligands_dir}',
    receptors_input='receptors_raw',  # Dummy, not used for ligands
    ligands_output='{output_dir}',
    receptors_output='receptors_prep',  # Dummy, not used for ligands
    force_field='{force_field}',
    ph={ph},
    plip_enabled={plip_enabled}
)

# Initialize and run pipeline
pipeline = AutoDockPreparationPipeline(config)
success = pipeline.run_enhanced_preparation()

if success:
    print("‚úÖ Ligand preparation completed successfully")
else:
    print("‚ùå Ligand preparation failed")
""".format(
            ligands_dir=ligands_dir,
            output_dir=output_dir,
            force_field=self.pdb_wizard_config['force_field'],
            ph=self.pdb_wizard_config['ph'],
            plip_enabled=self.pdb_wizard_config['plip_enabled']
        )
        
        # Write and execute script
        with open('prepare_ligands.py', 'w') as f:
            f.write(wizard_script)
        
        try:
            !python prepare_ligands.py
            print("‚úÖ Ligands prepared successfully")
        except Exception as e:
            print(f"‚ùå Error preparing ligands: {e}")
            # Fallback to basic preparation
            self._fallback_ligand_preparation(ligands_dir, output_dir)
    
    def prepare_receptors_with_wizard(self, receptors_dir="receptors_raw", output_dir="receptors_prep"):
        """Prepare receptors using the PDB wizard"""
        print("üß¨ Preparing receptors with PDB wizard...")
        
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        
        # Use the wizard's receptor preparation
        wizard_script = """
import sys
sys.path.append('pdb-prepare-wizard')
from autodock_preparation import AutoDockPreparationPipeline, PreparationConfig

# Create configuration
config = PreparationConfig(
    ligands_input='ligands_raw',  # Dummy, not used for receptors
    receptors_input='{receptors_dir}',
    ligands_output='ligands_prep',  # Dummy, not used for receptors
    receptors_output='{output_dir}',
    force_field='{force_field}',
    ph={ph},
    plip_enabled={plip_enabled}
)

# Initialize and run pipeline
pipeline = AutoDockPreparationPipeline(config)
success = pipeline.run_enhanced_preparation()

if success:
    results = pipeline.analyze_preparation_results('{output_dir}')
    print(f"‚úÖ Receptors prepared: {{results['receptors']['count']}}")
else:
    print("‚ùå Receptor preparation failed")
""".format(
            receptors_dir=receptors_dir,
            output_dir=output_dir,
            force_field=self.pdb_wizard_config['force_field'],
            ph=self.pdb_wizard_config['ph'],
            plip_enabled=self.pdb_wizard_config['plip_enabled']
        )
        
        # Write and execute script
        with open('prepare_receptors.py', 'w') as f:
            f.write(wizard_script)
        
        try:
            !python prepare_receptors.py
            print("‚úÖ Receptors prepared successfully")
        except Exception as e:
            print(f"‚ùå Error preparing receptors: {e}")
            # Fallback to basic preparation
            self._fallback_receptor_preparation(receptors_dir, output_dir)
    
    def _fallback_ligand_preparation(self, ligands_dir, output_dir):
        """Fallback ligand preparation using basic tools"""
        print("üîÑ Using fallback ligand preparation...")
        
        for ligand_file in os.listdir(ligands_dir):
            if ligand_file.endswith(('.sdf', '.mol2', '.pdb')):
                input_path = os.path.join(ligands_dir, ligand_file)
                output_path = os.path.join(output_dir, ligand_file.replace('.sdf', '.pdbqt').replace('.mol2', '.pdbqt').replace('.pdb', '.pdbqt'))
                
                try:
                    # Use meeko for preparation
                    !mk_prepare_ligand.py -i "{input_path}" -o "{output_path}"
                except:
                    print(f"‚ö†Ô∏è Failed to prepare {ligand_file}")
    
    def _fallback_receptor_preparation(self, receptors_dir, output_dir):
        """Fallback receptor preparation using basic tools"""
        print("üîÑ Using fallback receptor preparation...")
        
        for receptor_file in os.listdir(receptors_dir):
            if receptor_file.endswith('.pdb'):
                input_path = os.path.join(receptors_dir, receptor_file)
                output_path = os.path.join(output_dir, receptor_file.replace('.pdb', '.pdbqt'))
                
                try:
                    # Use meeko for preparation
                    !mk_prepare_receptor.py --read_pdb "{input_path}" -p "{output_path}" --allow_bad_res --default_altloc A
                except:
                    print(f"‚ö†Ô∏è Failed to prepare {receptor_file}")
    
    def analyze_preparation_results(self, prep_dir):
        """Analyze preparation results"""
        print(f"üìä Analyzing preparation results in {prep_dir}...")
        
        if not os.path.exists(prep_dir):
            print(f"‚ùå Directory {prep_dir} not found")
            return None
        
        # Count prepared files
        pdbqt_files = [f for f in os.listdir(prep_dir) if f.endswith('.pdbqt')]
        
        results = {
            'total_files': len(pdbqt_files),
            'successful_preparations': len(pdbqt_files),
            'failed_preparations': 0,
            'file_list': pdbqt_files
        }
        
        print(f"‚úÖ Preparation analysis complete:")
        print(f"   Total files: {results['total_files']}")
        print(f"   Successful: {results['successful_preparations']}")
        print(f"   Failed: {results['failed_preparations']}")
        
        return results
    
    def validate_prepared_structures(self, prep_dir):
        """Validate prepared structures"""
        print(f"üîç Validating prepared structures in {prep_dir}...")
        
        validation_results = {
            'valid_structures': [],
            'invalid_structures': [],
            'validation_errors': []
        }
        
        for pdbqt_file in os.listdir(prep_dir):
            if pdbqt_file.endswith('.pdbqt'):
                file_path = os.path.join(prep_dir, pdbqt_file)
                
                try:
                    # Basic validation - check if file is not empty and has proper format
                    with open(file_path, 'r') as f:
                        content = f.read()
                        
                    if len(content.strip()) > 0 and 'ATOM' in content:
                        validation_results['valid_structures'].append(pdbqt_file)
                    else:
                        validation_results['invalid_structures'].append(pdbqt_file)
                        validation_results['validation_errors'].append(f"{pdbqt_file}: Empty or invalid format")
                        
                except Exception as e:
                    validation_results['invalid_structures'].append(pdbqt_file)
                    validation_results['validation_errors'].append(f"{pdbqt_file}: {str(e)}")
        
        print(f"‚úÖ Validation complete:")
        print(f"   Valid structures: {len(validation_results['valid_structures'])}")
        print(f"   Invalid structures: {len(validation_results['invalid_structures'])}")
        
        if validation_results['validation_errors']:
            print("‚ö†Ô∏è Validation errors:")
            for error in validation_results['validation_errors']:
                print(f"   - {error}")
        
        return validation_results

class EnhancedGNINAWorkflow(FlexibleTieredWorkflow):
    """Complete GNINA workflow with PDB preparation wizard integration"""
    
    def __init__(self, config, max_workers=None, resume_file="workflow_state.json"):
        super().__init__(config, max_workers, resume_file)
        self.pdb_integration = PDBPreparationIntegration(config)
    
    def setup_complete_workflow(self, pairlist_df, use_pdb_wizard=True):
        """Setup complete workflow with PDB preparation"""
        print("üöÄ Setting up complete GNINA workflow...")
        
        if use_pdb_wizard:
            print("üì• Setting up PDB preparation wizard...")
            self.pdb_integration.download_pdb_wizard()
            
            print("üß™ Preparing ligands with wizard...")
            self.pdb_integration.prepare_ligands_with_wizard()
            
            print("üß¨ Preparing receptors with wizard...")
            self.pdb_integration.prepare_receptors_with_wizard()
            
            # Validate preparations
            ligand_validation = self.pdb_integration.validate_prepared_structures("ligands_prep")
            receptor_validation = self.pdb_integration.validate_prepared_structures("receptors_prep")
            
        else:
            print("‚ö†Ô∏è Skipping PDB wizard, using basic preparation")
        
        # Configure flexible docking
        print("üîÑ Configuring flexible receptor docking...")
        self.set_bulk_flexibility(pairlist_df, auto_detect=True)
        
        print("‚úÖ Complete workflow setup finished!")
    
    def run_enhanced_workflow(self, pairlist_df, stages=['A', 'B'], use_pdb_wizard=True):
        """Run the complete enhanced workflow"""
        print("üéØ Starting Enhanced GNINA Workflow with PDB Wizard Integration")
        
        # Setup workflow
        self.setup_complete_workflow(pairlist_df, use_pdb_wizard)
        
        # Run the tiered workflow
        results = self.run_complete_workflow_with_resume(pairlist_df, stages)
        
        # Generate enhanced analysis
        self.generate_enhanced_analysis(results)
        
        return results
    
    def generate_enhanced_analysis(self, results):
        """Generate enhanced analysis with PDB wizard integration"""
        print("üìä Generating enhanced analysis...")
        
        # Create analysis directory
        analysis_dir = "enhanced_analysis"
        os.makedirs(analysis_dir, exist_ok=True)
        
        # Analyze preparation quality
        if os.path.exists("ligands_prep"):
            ligand_analysis = self.pdb_integration.analyze_preparation_results("ligands_prep")
            with open(f"{analysis_dir}/ligand_preparation_analysis.json", 'w') as f:
                json.dump(ligand_analysis, f, indent=2)
        
        if os.path.exists("receptors_prep"):
            receptor_analysis = self.pdb_integration.analyze_preparation_results("receptors_prep")
            with open(f"{analysis_dir}/receptor_preparation_analysis.json", 'w') as f:
                json.dump(receptor_analysis, f, indent=2)
        
        # Generate comprehensive report
        self._generate_comprehensive_report(results, analysis_dir)
        
        print("‚úÖ Enhanced analysis complete!")
    
    def _generate_comprehensive_report(self, results, analysis_dir):
        """Generate comprehensive analysis report"""
        report = {
            'workflow_summary': {
                'total_results': len(results),
                'successful_dockings': len([r for r in results if r['status'] == 'success']),
                'failed_dockings': len([r for r in results if r['status'] != 'success']),
                'stages_completed': list(self.stage_results.keys()),
                'flexible_receptors_used': len(self.flexible_docker.flexible_residues)
            },
            'preparation_quality': {
                'pdb_wizard_used': True,
                'ligand_preparation': 'Enhanced with PDB wizard',
                'receptor_preparation': 'Enhanced with PDB wizard'
            },
            'performance_metrics': {
                'parallel_processing': True,
                'max_workers': self.max_workers,
                'resume_capability': True
            }
        }
        
        with open(f"{analysis_dir}/comprehensive_report.json", 'w') as f:
            json.dump(report, f, indent=2)

# Initialize enhanced workflow
enhanced_workflow = EnhancedGNINAWorkflow(CONFIG, max_workers=4)
print("‚úÖ Enhanced GNINA Workflow with PDB Wizard Integration initialized")


In [None]:
# =============================================================================
# Phase 2: Enhanced Workflow Execution
# =============================================================================

def run_enhanced_workflow(workflow_type="enhanced", stages=['A', 'B'], use_pdb_wizard=True):
    """
    Run the enhanced workflow with flexible receptors and PDB wizard integration
    
    Args:
        workflow_type: "enhanced", "flexible", "basic"
        stages: List of stages to run ['A', 'B', 'C']
        use_pdb_wizard: Whether to use PDB preparation wizard
    """
    
    if 'pairlist_df' not in locals() or pairlist_df is None:
        print("‚ùå Please ensure pairlist.csv is loaded before running workflow")
        return None
    
    print(f"üéØ Running {workflow_type.upper()} workflow with stages: {stages}")
    print(f"   PDB Wizard: {'Enabled' if use_pdb_wizard else 'Disabled'}")
    
    if workflow_type == "enhanced":
        # Complete enhanced workflow with PDB wizard and flexible receptors
        return enhanced_workflow.run_enhanced_workflow(pairlist_df, stages, use_pdb_wizard)
    
    elif workflow_type == "flexible":
        # Flexible receptor workflow without PDB wizard
        flexible_workflow.set_bulk_flexibility(pairlist_df, auto_detect=True)
        return flexible_workflow.run_complete_workflow_with_resume(pairlist_df, stages)
    
    elif workflow_type == "basic":
        # Basic workflow without enhancements
        return resume_workflow.run_complete_workflow_with_resume(pairlist_df, stages)
    
    else:
        print(f"‚ùå Unknown workflow type: {workflow_type}")
        return None

# =============================================================================
# Flexible Receptor Configuration Examples
# =============================================================================

def configure_flexible_docking_examples():
    """Examples of how to configure flexible docking"""
    
    print("üîß Flexible Docking Configuration Examples:")
    print("\n1. Auto-detect flexible residues (recommended):")
    print("   flexible_workflow.set_bulk_flexibility(pairlist_df, auto_detect=True)")
    
    print("\n2. Manual specification for specific receptor:")
    print("   flexible_workflow.set_receptor_flexibility(")
    print("       'receptor_name',")
    print("       flexible_residues=['A:123', 'A:124', 'A:125']")
    print("   )")
    
    print("\n3. Configure flexible docking parameters:")
    print("   flexible_workflow.configure_flexible_docking({")
    print("       'distance_threshold': 6.0,      # Distance from binding site")
    print("       'max_flexible_residues': 15,    # Maximum flexible residues")
    print("       'flexdist': 4.0                 # GNINA flexdist parameter")
    print("   })")
    
    print("\n4. Run enhanced workflow with flexible receptors:")
    print("   results = run_enhanced_workflow('enhanced', stages=['A', 'B'])")

# =============================================================================
# Quality Control and Validation
# =============================================================================

class QualityControlValidator:
    """Quality control and validation for the enhanced workflow"""
    
    def __init__(self):
        self.validation_results = {}
    
    def validate_input_files(self, pairlist_df):
        """Validate input files and structure"""
        print("üîç Validating input files...")
        
        validation_results = {
            'pairlist_valid': True,
            'missing_receptors': [],
            'missing_ligands': [],
            'invalid_coordinates': [],
            'warnings': []
        }
        
        # Check pairlist structure
        required_cols = ['receptor', 'ligand', 'center_x', 'center_y', 'center_z', 
                        'size_x', 'size_y', 'size_z']
        missing_cols = [col for col in required_cols if col not in pairlist_df.columns]
        
        if missing_cols:
            validation_results['pairlist_valid'] = False
            validation_results['warnings'].append(f"Missing columns: {missing_cols}")
        
        # Check file existence
        for _, row in pairlist_df.iterrows():
            receptor_file = f"receptors_prep/{row['receptor']}.pdbqt"
            ligand_file = f"ligands_prep/{row['ligand']}.pdbqt"
            
            if not os.path.exists(receptor_file):
                validation_results['missing_receptors'].append(row['receptor'])
            
            if not os.path.exists(ligand_file):
                validation_results['missing_ligands'].append(row['ligand'])
            
            # Check coordinate validity
            coords = [row['center_x'], row['center_y'], row['center_z']]
            sizes = [row['size_x'], row['size_y'], row['size_z']]
            
            if any(not isinstance(c, (int, float)) or np.isnan(c) for c in coords + sizes):
                validation_results['invalid_coordinates'].append(f"{row['receptor']}-{row['ligand']}")
        
        # Print validation summary
        print(f"‚úÖ Pairlist valid: {validation_results['pairlist_valid']}")
        print(f"   Missing receptors: {len(validation_results['missing_receptors'])}")
        print(f"   Missing ligands: {len(validation_results['missing_ligands'])}")
        print(f"   Invalid coordinates: {len(validation_results['invalid_coordinates'])}")
        
        if validation_results['warnings']:
            print("‚ö†Ô∏è Warnings:")
            for warning in validation_results['warnings']:
                print(f"   - {warning}")
        
        self.validation_results['input_validation'] = validation_results
        return validation_results
    
    def validate_docking_results(self, results):
        """Validate docking results quality"""
        print("üîç Validating docking results...")
        
        validation_results = {
            'total_results': len(results),
            'successful_dockings': 0,
            'failed_dockings': 0,
            'quality_issues': [],
            'score_distribution': {}
        }
        
        successful_results = [r for r in results if r['status'] == 'success']
        validation_results['successful_dockings'] = len(successful_results)
        validation_results['failed_dockings'] = len(results) - len(successful_results)
        
        if successful_results:
            # Analyze score distribution
            all_scores = []
            for result in successful_results:
                scores = result.get('scores', [])
                if scores:
                    all_scores.extend([s.get('cnn_score', 0) for s in scores])
            
            if all_scores:
                validation_results['score_distribution'] = {
                    'mean': np.mean(all_scores),
                    'std': np.std(all_scores),
                    'min': np.min(all_scores),
                    'max': np.max(all_scores),
                    'median': np.median(all_scores)
                }
                
                # Quality checks
                if np.mean(all_scores) < 0.3:
                    validation_results['quality_issues'].append("Low average CNN scores")
                
                if np.std(all_scores) < 0.1:
                    validation_results['quality_issues'].append("Low score variance - possible issues")
        
        # Print validation summary
        print(f"‚úÖ Total results: {validation_results['total_results']}")
        print(f"   Successful: {validation_results['successful_dockings']}")
        print(f"   Failed: {validation_results['failed_dockings']}")
        
        if validation_results['score_distribution']:
            dist = validation_results['score_distribution']
            print(f"   Score distribution:")
            print(f"     Mean: {dist['mean']:.3f}")
            print(f"     Std: {dist['std']:.3f}")
            print(f"     Range: {dist['min']:.3f} - {dist['max']:.3f}")
        
        if validation_results['quality_issues']:
            print("‚ö†Ô∏è Quality issues detected:")
            for issue in validation_results['quality_issues']:
                print(f"   - {issue}")
        
        self.validation_results['docking_validation'] = validation_results
        return validation_results
    
    def generate_quality_report(self):
        """Generate comprehensive quality report"""
        print("üìä Generating quality control report...")
        
        report = {
            'timestamp': datetime.now().isoformat(),
            'validation_results': self.validation_results,
            'recommendations': []
        }
        
        # Generate recommendations based on validation results
        if 'input_validation' in self.validation_results:
            input_val = self.validation_results['input_validation']
            if input_val['missing_receptors']:
                report['recommendations'].append("Check receptor file preparation")
            if input_val['missing_ligands']:
                report['recommendations'].append("Check ligand file preparation")
            if input_val['invalid_coordinates']:
                report['recommendations'].append("Validate binding site coordinates")
        
        if 'docking_validation' in self.validation_results:
            dock_val = self.validation_results['docking_validation']
            if dock_val['failed_dockings'] > dock_val['successful_dockings']:
                report['recommendations'].append("High failure rate - check input structures")
            if dock_val['quality_issues']:
                report['recommendations'].append("Review docking parameters and structures")
        
        # Save report
        with open('quality_control_report.json', 'w') as f:
            json.dump(report, f, indent=2)
        
        print("‚úÖ Quality control report saved to quality_control_report.json")
        return report

# Initialize quality control validator
qc_validator = QualityControlValidator()
print("‚úÖ Quality Control Validator initialized")

# =============================================================================
# Main Execution Options for Phase 2
# =============================================================================

print("üöÄ Phase 2 Enhanced Workflow Ready!")
print("\nAvailable workflow types:")
print("  1. enhanced  - Complete workflow with PDB wizard + flexible receptors")
print("  2. flexible  - Flexible receptor workflow (no PDB wizard)")
print("  3. basic     - Basic workflow (no enhancements)")

print("\nExample usage:")
print("  # Complete enhanced workflow (recommended)")
print("  results = run_enhanced_workflow('enhanced', stages=['A', 'B'])")
print("")
print("  # Flexible receptor only")
print("  results = run_enhanced_workflow('flexible', stages=['A', 'B'])")
print("")
print("  # Basic workflow")
print("  results = run_enhanced_workflow('basic', stages=['A', 'B'])")

print("\nQuality control:")
print("  qc_validator.validate_input_files(pairlist_df)")
print("  qc_validator.validate_docking_results(results)")
print("  qc_validator.generate_quality_report()")

print("\nFlexible docking configuration:")
configure_flexible_docking_examples()


In [None]:
import os, glob, textwrap
root = '/content/drive/MyDrive/EFA_Docking'   # same value you used
print("Files under", root)
print(textwrap.fill("  ".join(sorted(os.listdir(root))), 100))


Files under /content/drive/MyDrive/EFA_Docking
gnina  gnina_out  ligands  ligands_prep  ligands_raw  pairlist.gsheet  pairlist1.csv  pairlist2.csv
pairlist3.csv  pairlist4.numbers  receptors  receptors_prep  receptors_raw


In [None]:
# fresh install of everything needed for ligand ‚Üí PDBQT
!pip install -q rdkit-pypi


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m29.4/29.4 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip show rdkit-pypi | head -4
!pip show meeko      | head -4


Name: rdkit-pypi
Version: 2022.9.5
Summary: A collection of chemoinformatics and machine-learning software written in C++ and Python
Home-page: https://github.com/kuelumbus/rdkit-pypi
Name: meeko
Version: 0.6.1
Summary: Python package for preparing small molecule for docking
Home-page: https://github.com/ccsb-scripps/meeko
ERROR: Pipe to stdout was broken
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>
BrokenPipeError: [Errno 32] Broken pipe


In [None]:
#!/usr/bin/env bash
set -euo pipefail

# ‚îÄ‚îÄ edit these three paths if you renamed your folders ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
ROOT="${1:-$(pwd)}"
RAW_LIG="$ROOT/ligands_raw"          # *.sdf or *.mol2
RAW_REC="$ROOT/receptors_raw"        # *.pdb
OUT_LIG="$ROOT/ligands_prep"         # <- output *.pdbqt
OUT_REC="$ROOT/receptors_prep"       # <- output *.pdbqt
mkdir -p "$OUT_LIG" "$OUT_REC"

# optional: PDB2PQR force-field (AMBER, PARSE, CHARMM, OPLS)
FF="AMBER"

shopt -s nullglob

echo "üß™  Ligand preparation -----------------------------------"
for mol in "$RAW_LIG"/*.{mol2,sdf}; do
  base=${mol##*/}; base=${base%.*}
  out="$OUT_LIG/${base}.pdbqt"
  [[ -f $out ]] && { echo "skip $base"; continue; }
  mk_prepare_ligand.py -i "$mol" -o "$out"
done

echo -e "\nüß¨  Receptor preparation (PDB ‚Üí PQR ‚Üí clean PDB) --------"
for pdb in "$RAW_REC"/*.pdb; do
  base=${pdb##*/}; base=${base%.*}
  pqr="$OUT_REC/${base}.pqr"
  clean_pdb="$OUT_REC/${base}_clean.pdb"
  pdbqt="$OUT_REC/${base}.pdbqt"

  [[ -f $pdbqt ]] && { echo "skip $base"; continue; }

  # 1) PDB ‚Üí PQR  (repairs and protonates)
  pdb2pqr30 --ff "$FF" --with-ph 7.4 "$pdb" "$pqr" >/dev/null

  # 2) PQR ‚Üí cleaned-PDB  (strip charges/radii)
  obabel "$pqr" -O "$clean_pdb"  >/dev/null

  # 3) PDB ‚Üí PDBQT via Meeko  (still keep guard flags)
  mk_prepare_receptor.py --read_pdb "$clean_pdb" \
                         -p "$pdbqt"              \
                         --allow_bad_res          \
                         --default_altloc A
done

echo -e "\n‚úÖ  Finished."
echo "Ligands   prepared: $(ls -1q $OUT_LIG/*.pdbqt  2>/dev/null | wc -l)"
echo "Receptors prepared: $(ls -1q $OUT_REC/*.pdbqt  2>/dev/null | wc -l)"


üß™  Ligand preparation -----------------------------------
Input molecules processed: 1, skipped: 0
PDBQT files written: 1
PDBQT files not written due to error: 0
Input molecules with errors: 0
Input molecules processed: 1, skipped: 0
PDBQT files written: 1
PDBQT files not written due to error: 0
Input molecules with errors: 0
Input molecules processed: 1, skipped: 0
PDBQT files written: 1
PDBQT files not written due to error: 0
Input molecules with errors: 0
Input molecules processed: 1, skipped: 0
PDBQT files written: 1
PDBQT files not written due to error: 0
Input molecules with errors: 0
Input molecules processed: 1, skipped: 0
PDBQT files written: 1
PDBQT files not written due to error: 0
Input molecules with errors: 0
Input molecules processed: 1, skipped: 0
PDBQT files written: 1
PDBQT files not written due to error: 0
Input molecules with errors: 0
Input molecules processed: 1, skipped: 0
PDBQT files written: 1
PDBQT files not written due to error: 0
Input molecules with erro

No template matched for residue_key='B:417'
tried 6 templates for residue_key='B:417'excess_H_ok=False
LYS        heavy_miss=4 heavy_excess=0 H_excess=[] bond_miss={4} bond_excess=set()
NLYS       heavy_miss=4 heavy_excess=0 H_excess=[] bond_miss=set() bond_excess=set()
CLYS       heavy_miss=5 heavy_excess=0 H_excess=[] bond_miss={5} bond_excess={1}
LYN        heavy_miss=4 heavy_excess=0 H_excess=[] bond_miss={4} bond_excess=set()
NLYN       heavy_miss=4 heavy_excess=0 H_excess=[] bond_miss=set() bond_excess=set()
CLYN       heavy_miss=5 heavy_excess=0 H_excess=[] bond_miss={5} bond_excess={1}

No template matched for residue_key='B:427'
tried 6 templates for residue_key='B:427'excess_H_ok=False
GLU        heavy_miss=4 heavy_excess=0 H_excess=[] bond_miss=set() bond_excess=set()
NGLU       heavy_miss=4 heavy_excess=0 H_excess=[] bond_miss=set() bond_excess={4}
CGLU       heavy_miss=5 heavy_excess=0 H_excess=[] bond_miss=set() bond_excess={1}
GLH        heavy_miss=4 heavy_excess=0 H_exc

CalledProcessError: Command 'b'set -e\n# \xe2\x94\x80\xe2\x94\x80 adjust these four folders if your names differ \xe2\x94\x80\xe2\x94\x80\xe2\x94\x80\xe2\x94\x80\xe2\x94\x80\xe2\x94\x80\xe2\x94\x80\xe2\x94\x80\xe2\x94\x80\xe2\x94\x80\nROOT="/content/drive/MyDrive/EFA_Docking"\nRAW_LIG="$ROOT/ligands_raw"          # *.mol2  or *.sdf\nRAW_REC="$ROOT/receptors_raw"        # *.pdb\nOUT_LIG="$ROOT/ligands_prep"         # output *.pdbqt (ligands)\nOUT_REC="$ROOT/receptors_prep"       # output *.pdbqt (receptors)\nmkdir -p "$OUT_LIG" "$OUT_REC"\n\necho "\xf0\x9f\xa7\xaa  Ligand preparation -----------------------------------"\nshopt -s nullglob\nfor mol in "$RAW_LIG"/*.{mol2,sdf}; do\n  base=$(basename "${mol%.*}")\n  out="$OUT_LIG/${base}.pdbqt"\n  [[ -f "$out" ]] && { echo "skip $base"; continue; }\n  mk_prepare_ligand.py  -i "$mol"  -o "$out"\ndone\n\necho -e "\\n\xf0\x9f\xa7\xac  Receptor preparation -------------------------------"\nfor pdb in "$RAW_REC"/*.pdb; do\n  base=$(basename "${pdb%.*}")\n  out="$OUT_REC/${base}.pdbqt"\n  [[ -f "$out" ]] && { echo "skip $base"; continue; }\n  mk_prepare_receptor.py  --read_pdb "$pdb"  -p "$out"\ndone\n\necho -e "\\n\xe2\x9c\x85  Done."\necho   "Ligands   written: $(ls -1q $OUT_LIG/*.pdbqt 2>/dev/null | wc -l)"\necho   "Receptors written: $(ls -1q $OUT_REC/*.pdbqt 2>/dev/null | wc -l)"\n'' returned non-zero exit status 1.

In [5]:
# --------------------------------------------------------------
# Load CSV robustly + sanity checks
# --------------------------------------------------------------
pairs = pd.read_csv(f'{root}/pairlist.csv', skipinitialspace=True)
pairs.columns = [c.strip().lower() for c in pairs.columns]            # trim & lowercase

required = {'receptor','site_id','ligand',
            'center_x','center_y','center_z',
            'size_x','size_y','size_z'}
missing_cols = required.difference(pairs.columns)
assert not missing_cols, f"CSV is missing columns: {missing_cols}"

# check files exist
for rec in pairs['receptor'].unique():
    assert pathlib.Path(f"{rec_dir}/{rec}.pdbqt").is_file(), f"receptor file missing: {rec}"
for lig in pairs['ligand'].unique():
    assert pathlib.Path(f"{lig_dir}/{lig}.pdbqt").is_file(), f"ligand file missing: {lig}"
print("‚úì CSV headers OK and all receptor / ligand files found")


NameError: name 'pd' is not defined

In [None]:
# --------------------------------------------------------------
#  Batch GNINA docking  (CPU build, no --gpu flag)
# --------------------------------------------------------------
import os, re, pathlib, shlex, subprocess
import pandas as pd
from tqdm.auto import tqdm

root    = '/content/drive/MyDrive/Sertaline_Derv_docking'
rec_dir = f'{root}/receptors'
lig_dir = f'{root}/ligands'
out_dir = pathlib.Path(f'{root}/gnina_out'); out_dir.mkdir(exist_ok=True)

GNINA_BIN = pathlib.Path("/content/drive/MyDrive/Sertaline_Derv_docking/gnina")
assert GNINA_BIN.is_file(), "gnina binary not found!"

pairs = pd.read_csv(f'{root}/pairlist.csv', skipinitialspace=True)
pairs.columns = [c.strip().lower() for c in pairs.columns]

def clean_cell(x):
    s = str(x).strip().strip('"').strip("'")
    s = re.sub(r'\s+', '', s)  # remove inner spaces if any were accidental
    return s

# Clean up key columns
for col in ['receptor', 'ligand', 'site_id']:
    if col in pairs.columns:
        pairs[col] = pairs[col].map(clean_cell)

# Ensure numeric docking box
for col in ['center_x','center_y','center_z','size_x','size_y','size_z']:
    pairs[col] = pd.to_numeric(pairs[col], errors='raise')

# Build case-insensitive indices of existing files
def build_index(directory):
    return {fname.lower(): fname for fname in os.listdir(directory)}

rec_index = build_index(rec_dir)
lig_index = build_index(lig_dir)

def resolve_path(directory, name, ext='.pdbqt', index=None):
    """Return a real, existing path with exactly one ext; case-insensitive match."""
    base = clean_cell(name)
    cand = base if base.lower().endswith(ext) else base + ext
    if index is not None:
        real = index.get(cand.lower())
        if real:
            return os.path.join(directory, real)
    # Fallback: direct path (case-sensitive)
    path = os.path.join(directory, cand)
    return path if os.path.exists(path) else None

# Preflight: check that all receptor/ligand files exist after normalization
missing = []
for _, row in pairs.iterrows():
    rec_path = resolve_path(rec_dir, row['receptor'], index=rec_index)
    lig_path = resolve_path(lig_dir, row['ligand'],   index=lig_index)
    if not rec_path or not lig_path:
        missing.append({
            'receptor': row['receptor'], 'ligand': row['ligand'],
            'missing_receptor': not bool(rec_path), 'missing_ligand': not bool(lig_path)
        })

if missing:
    print("‚ö†Ô∏è Some files are missing after normalization/case-resolution:\n")
    for m in missing[:20]:
        print(m)
    print(f"\nTotal missing: {len(missing)}. Fix these names or files before docking.\n")

# ---- Docking (GPU/CPU logic optional; focus here is filename fix)
EXHAUSTIVENESS, NUM_MODES, SEED = 16, 20, 0
gpu_flag = ""  # or "--gpu" if you‚Äôre using the GPU version

def gnina_cmd(row):
    rec  = resolve_path(rec_dir, row['receptor'], index=rec_index)
    lig  = resolve_path(lig_dir, row['ligand'],   index=lig_index)
    assert rec and lig, f"Missing file(s): rec={row['receptor']} lig={row['ligand']}"
    tag  = f"{row['receptor']}_{row.get('site_id','NA')}_{row['ligand']}"
    pose = out_dir / f"{tag}_top.sdf"
    log  = out_dir / f"{tag}.log"
    return (
        f"{GNINA_BIN} {gpu_flag} "
        f"--receptor {shlex.quote(rec)} --ligand {shlex.quote(lig)} "
        f"--center_x {row['center_x']} --center_y {row['center_y']} --center_z {row['center_z']} "
        f"--size_x {row['size_x']} --size_y {row['size_y']} --size_z {row['size_z']} "
        f"--exhaustiveness {EXHAUSTIVENESS} --num_modes {NUM_MODES} --seed {SEED} "
        f"--cnn_scoring rescore --out {shlex.quote(str(pose))} --log {shlex.quote(str(log))}"
    ).strip()

failures = []
for _, row in tqdm(pairs.iterrows(), total=len(pairs), desc="Docking"):
    try:
        cmd = gnina_cmd(row)
        subprocess.run(shlex.split(cmd), check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except AssertionError as e:
        print(f"‚ö†Ô∏è  Skipping {row['receptor']}-{row.get('site_id','NA')}-{row['ligand']} :: {e}")
        failures.append(row)
    except subprocess.CalledProcessError as e:
        print(f"‚ö†Ô∏è  Dock failed for {row['receptor']}-{row.get('site_id','NA')}-{row['ligand']}")
        print(e.stderr.decode(errors="ignore")[:300], "‚Ä¶")
        failures.append(row)

print(f"‚úÖ  Docking finished: {len(pairs)-len(failures)} successes, {len(failures)} failures")



Docking:   0%|          | 0/125 [00:00<?, ?it/s]

In [6]:
# --------------------------------------------------------------
#  Batch GNINA docking  (prefers GPU; falls back to CPU)
# --------------------------------------------------------------
import os, pathlib, shlex, subprocess
import pandas as pd
from tqdm.auto import tqdm

# ---- Paths (adjust as needed)
root    = '/content/drive/MyDrive/Sertaline_Derv_docking'
rec_dir = f'{root}/receptors'
lig_dir = f'{root}/ligands'
out_dir = pathlib.Path(f'{root}/gnina_out'); out_dir.mkdir(exist_ok=True)

GNINA_BIN = pathlib.Path("/content/drive/MyDrive/Sertaline_Derv_docking/gnina")  # CUDA build preferred
assert GNINA_BIN.is_file(), "gnina binary not found!"

pairs = pd.read_csv(f'{root}/pairlist.csv', skipinitialspace=True)
pairs.columns = [c.strip().lower() for c in pairs.columns]

# ---- User knobs
PREFER_GPU      = True                 # try to use GPU if present & supported
SELECT_GPU_ID   = 0                    # which GPU to use (if multiple)
EXHAUSTIVENESS  = 16
NUM_MODES       = 20
SEED            = 0

# ---- Helpers
def have_nvidia_gpu() -> bool:
    # Colab-style quick checks
    if os.path.exists("/proc/driver/nvidia/version"):
        return True
    try:
        out = subprocess.run(["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        return b"GPU" in out.stdout
    except Exception:
        return False

def gnina_supports_gpu() -> bool:
    """Best-effort check: run `gnina --help` and search for '--gpu' flag."""
    try:
        out = subprocess.run([str(GNINA_BIN), "--help"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
        txt = out.stdout.decode(errors="ignore").lower()
        return "--gpu" in txt  # gnina CUDA builds expose this flag
    except Exception:
        return False

USE_GPU = PREFER_GPU and have_nvidia_gpu() and gnina_supports_gpu()

if USE_GPU:
    # Limit gnina to a specific GPU if requested
    os.environ["CUDA_VISIBLE_DEVICES"] = str(SELECT_GPU_ID)
    gpu_flag = "--gpu"
    print(f"‚úÖ Using GPU (CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']})")
else:
    gpu_flag = ""
    if PREFER_GPU:
        print("‚ö†Ô∏è GPU requested but not available or gnina not built with CUDA. Falling back to CPU.")
    else:
        print("‚ÑπÔ∏è Using CPU as requested.")

def gnina_cmd(row):
    rec  = f"{rec_dir}/{row['receptor']}.pdbqt"
    lig  = f"{lig_dir}/{row['ligand']}.pdbqt"
    tag  = f"{row['receptor']}_{row['site_id']}_{row['ligand']}"
    pose = out_dir / f"{tag}_top.sdf"
    log  = out_dir / f"{tag}.log"

    return (
        f"{GNINA_BIN} {gpu_flag} "
        f"--receptor {shlex.quote(rec)} --ligand {shlex.quote(lig)} "
        f"--center_x {row['center_x']} --center_y {row['center_y']} --center_z {row['center_z']} "
        f"--size_x {row['size_x']} --size_y {row['size_y']} --size_z {row['size_z']} "
        f"--exhaustiveness {EXHAUSTIVENESS} --num_modes {NUM_MODES} --seed {SEED} "
        f"--cnn_scoring rescore --out {shlex.quote(str(pose))} --log {shlex.quote(str(log))}"
    ).strip()

failures = []
for _, row in tqdm(pairs.iterrows(), total=len(pairs), desc="Docking"):
    cmd = gnina_cmd(row)
    try:
        # inherit env so CUDA_VISIBLE_DEVICES is respected
        res = subprocess.run(shlex.split(cmd), check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ)
    except subprocess.CalledProcessError as e:
        print(f"‚ö†Ô∏è  Dock failed for {row['receptor']}-{row['site_id']}-{row['ligand']}")
        print(e.stderr.decode(errors="ignore")[:300], "‚Ä¶")
        failures.append(row)

print(f"‚úÖ  Docking finished: {len(pairs)-len(failures)} successes, {len(failures)} failures")

# Optional: save failures for quick reruns
if failures:
    pd.DataFrame(failures).to_csv(out_dir / "failures.csv", index=False)
    print(f"üíæ Saved failure rows to {out_dir/'failures.csv'}")


‚ö†Ô∏è GPU requested but not available or gnina not built with CUDA. Falling back to CPU.


Docking:   0%|          | 0/15 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# ‚¨áÔ∏è  post_gnina.py ‚Äì parses logs, builds complexes, splits poses
import re, argparse, pathlib, csv
import pandas as pd
from tqdm.auto import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
import openbabel as ob

def parse_log(log_path):
    tag = log_path.stem
    rows = []
    with open(log_path) as fh:
        for ln in fh:
            if ln.startswith("CNNaffinity"):
                rows.append(
                    dict(tag=tag,
                         cnn_aff=float(ln.split()[1]),
                         cnn_score=float(ln.split()[3].strip('()'))))
    return rows

def make_complexes(pose_sdf, rec_pdbqt, out_multi, out_dir):
    conv = ob.OBConversion(); conv.SetInAndOutFormats("pdbqt", "pdb")
    recmol = ob.OBMol();  conv.ReadFile(recmol, str(rec_pdbqt))
    rec_pdb = rec_pdbqt.with_suffix(".pdb")
    conv.WriteFile(recmol, str(rec_pdb))

    suppl = Chem.SDMolSupplier(str(pose_sdf), removeHs=False)
    out_dir.mkdir(exist_ok=True)
    with open(out_multi, "w") as big:
        for i, lig in enumerate(suppl, 1):
            with open(rec_pdb) as fh: big.write(fh.read())
            big.write(Chem.MolToPDBBlock(lig));  big.write("ENDMDL\n")
            split = out_dir / f"{pose_sdf.stem}_pose{i:02d}.pdb"
            with open(split, "w") as sp:
                with open(rec_pdb) as fh: sp.write(fh.read())
                sp.write(Chem.MolToPDBBlock(lig))
    rec_pdb.unlink()

def post_gnina(gnina_out, rec_prep):
    gnina_out = pathlib.Path(gnina_out)
    rec_prep  = pathlib.Path(rec_prep)

    # 1 logs ‚Üí CSV
    rows = []
    for log in tqdm(gnina_out.glob("*.log"), desc="parse logs"):
        rows += parse_log(log)
    pd.DataFrame(rows).to_csv(gnina_out/"all_scores.csv", index=False)

    # 2 + 3 complex build & split
    cmp_dir = gnina_out/"complexes"; cmp_dir.mkdir(exist_ok=True)
    for sdf in tqdm(gnina_out.glob("*_top.sdf"), desc="make complexes"):
        rec = rec_prep/f"{sdf.stem.split('_')[0]}.pdbqt"
        if not rec.exists():
            print("skip, receptor not found:", rec)
            continue
        make_complexes(sdf, rec,
                       out_multi = cmp_dir/f"{sdf.stem}_complexes.pdb",
                       out_dir   = cmp_dir)

# ------- call the helper --------------------------------------
GNINA_OUT   = "/content/drive/MyDrive/EFA_Docking/gnina_out"     # ‚Üê adjust
RECEPTOR_PREP = "/content/drive/MyDrive/EFA_Docking/receptors"
post_gnina(GNINA_OUT, RECEPTOR_PREP)


ModuleNotFoundError: No module named 'rdkit'