In [8]:
#!/usr/bin/env python3
"""
FoldX Solubility Assessment Script for Frankie's Pipeline Outputs
"""

import os
import shutil
import subprocess
import sys
from pathlib import Path

import pandas as pd

def read_experiments_excel(filepath):
    """Read experiment names and PDB file names from the experiments sheet.

    The sheet is read without a header row. Experiment names are taken from
    spreadsheet cells A13:A42 and PDB file names from E13:E42. Rows where
    either cell is missing or blank are skipped.

    Args:
        filepath: Path to the Excel workbook, or an already-loaded
            ``pandas.DataFrame`` holding the raw sheet (read with
            ``header=None``).

    Returns:
        A list of dicts with the keys ``'experiment_name'`` and
        ``'pdb_file'``, both whitespace-stripped strings. The list is empty
        when the sheet has no usable rows in the expected window.
    """
    if isinstance(filepath, pd.DataFrame):
        df = filepath
    else:
        df = pd.read_excel(filepath, header=None)

    # 0-indexed positions of spreadsheet rows 13..42 and columns A / E.
    first_row, end_row = 12, 42
    name_col, pdb_col = 0, 4

    # A sheet narrower than column E cannot contain any PDB file names.
    if df.shape[1] <= pdb_col:
        return []

    experiments = []
    # Clamp to the rows that actually exist so short sheets do not raise.
    for row in range(first_row, min(end_row, len(df))):
        raw_name = df.iat[row, name_col]
        raw_pdb = df.iat[row, pdb_col]
        if pd.isna(raw_name) or pd.isna(raw_pdb):
            continue

        # Strip before testing so whitespace-only cells count as empty.
        exp_name = str(raw_name).strip()
        pdb_file = str(raw_pdb).strip()
        if exp_name and pdb_file:
            experiments.append({
                'experiment_name': exp_name,
                'pdb_file': pdb_file
            })

    return experiments

def find_pdb_file(pdb_filename, experiment_name, experiments_dir):
    """Locate a PDB file belonging to one experiment.

    The conventional location
    ``<experiments_dir>/<experiment_name>/4_docking/output/08_mdscoring/``
    is checked first. If the file is not there, the experiment's directory is
    searched recursively and the first match encountered is used.

    Returns the path as a string, or None when nothing is found.
    """
    exp_root = Path(experiments_dir) / experiment_name

    # Preferred location produced by the docking pipeline.
    canonical = exp_root / "4_docking" / "output" / "08_mdscoring" / pdb_filename
    if canonical.exists():
        return str(canonical)

    # Nothing to search if the experiment directory itself is absent.
    if not exp_root.exists():
        return None

    # Fallback: first recursive hit anywhere under the experiment directory.
    first_hit = next(exp_root.rglob(pdb_filename), None)
    if first_hit is None:
        return None
    return str(first_hit)

def run_foldx_analysis(pdb_path, experiment_name, output_dir, foldx_path="/home/nicholas/Downloads/foldx_20251231"):
    """Run the FoldX Stability, AnalyseComplex and SequenceDetail commands.

    The PDB file is copied into ``<output_dir>/<experiment_name>_analysis``
    and each FoldX command is executed with that directory as its working
    directory. The caller's own working directory is never changed.

    Args:
        pdb_path: Path to the input PDB file.
        experiment_name: Used to name the per-experiment analysis directory.
        output_dir: Parent directory for all analysis directories.
        foldx_path: Path to the FoldX executable.

    Returns:
        A dict with the keys ``'stability'``, ``'analyse_complex'`` and
        ``'sequence_detail'``. Each value is ``'completed'`` when the command
        exited with status 0, otherwise ``'error: <stderr>'``.

    Raises:
        OSError: If the PDB file cannot be copied or the FoldX executable
            cannot be started.
    """
    pdb_file = Path(pdb_path)
    pdb_name = pdb_file.stem

    # Create output directory for this analysis using experiment name
    analysis_dir = Path(output_dir) / f"{experiment_name}_analysis"
    analysis_dir.mkdir(parents=True, exist_ok=True)

    # FoldX is given a bare file name below, so the PDB must sit in the
    # directory the command runs in. Skip the copy when the input already
    # is that file (copying a file onto itself raises).
    local_pdb = analysis_dir / pdb_file.name
    if pdb_file.resolve() != local_pdb.resolve():
        shutil.copy2(pdb_file, local_pdb)

    # (result key, FoldX --command value, label used in the progress message)
    commands = (
        ('stability', 'Stability', 'Stability analysis'),
        ('analyse_complex', 'AnalyseComplex', 'AnalyseComplex'),
        ('sequence_detail', 'SequenceDetail', 'SequenceDetail'),
    )

    results = {}
    for key, command, label in commands:
        print(f"Running {label} for {pdb_name}...")
        completed = subprocess.run(
            [foldx_path, f'--command={command}', f'--pdb={pdb_file.name}'],
            capture_output=True,
            text=True,
            # Run inside the analysis directory instead of os.chdir(), which
            # would change the working directory of the whole process.
            cwd=str(analysis_dir),
        )
        if completed.returncode == 0:
            results[key] = 'completed'
        else:
            results[key] = f'error: {completed.stderr}'

    return results

def parse_foldx_results(analysis_dir):
    """Parse FoldX output files in an analysis directory.

    Only the Stability output (``*_0_ST.fxout``) is parsed at present. The
    second tab-separated column of the first data line that holds a number
    there is stored as ``'total_energy'``. Blank lines, lines starting with
    ``#`` and lines whose second column is not numeric (for example a column
    header) are skipped.

    Args:
        analysis_dir: Directory containing the FoldX output files.

    Returns:
        A dict of metrics. It contains ``'total_energy'`` (float) when a
        Stability file with a numeric value was found, and is empty otherwise.
    """
    analysis_path = Path(analysis_dir)
    metrics = {}

    # Sort so the same file is chosen on every run if several match.
    stability_files = sorted(analysis_path.glob("*_0_ST.fxout"))
    if stability_files:
        with open(stability_files[0], 'r') as f:
            for line in f:
                stripped = line.strip()
                if not stripped or stripped.startswith('#'):
                    continue
                parts = stripped.split('\t')
                if len(parts) < 2:
                    continue
                try:
                    metrics['total_energy'] = float(parts[1])
                except ValueError:
                    # Not a data row; keep looking rather than aborting.
                    continue
                break

    # TODO: AnalyseComplex output (*_AC.fxout) is not parsed yet; the
    # extraction of solubility metrics needs to be written against the
    # actual FoldX output format.

    return metrics

def main():
    """Run the FoldX assessment for every experiment listed in the workbook.

    Reads the experiment list, locates each experiment's PDB file, runs the
    FoldX commands on it, parses the resulting metrics, and writes a summary
    table to ``solubility_assessment_summary.csv`` and ``.xlsx`` in the output
    directory. A per-experiment failure is recorded in the summary and does
    not stop the remaining experiments.
    """
    # Configuration
    excel_file = "paper/experiments.xlsx"
    experiments_dir = "experiments"
    output_dir = "foldx_results"
    foldx_path = "/home/nicholas/Downloads/foldx_20251231"  # Adjust path to your FoldX executable

    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Read experiment data
    print("Reading experiment data from Excel file...")
    experiments = read_experiments_excel(excel_file)
    print(f"Found {len(experiments)} experiments to process")

    # Process each experiment
    results_summary = []

    for i, exp in enumerate(experiments, 1):
        print(f"\n[{i}/{len(experiments)}] Processing {exp['experiment_name']}...")

        # Every row starts with the same keys so the summary has a stable
        # schema whatever the outcome.
        row = {
            'experiment_name': exp['experiment_name'],
            'pdb_file': exp['pdb_file'],
            'status': None,
            'total_energy': None,
            'solubility_score': None,
        }

        # Find PDB file
        pdb_path = find_pdb_file(exp['pdb_file'], exp['experiment_name'], experiments_dir)

        if not pdb_path:
            print(f"  WARNING: PDB file {exp['pdb_file']} not found")
            row['status'] = 'PDB not found'
            results_summary.append(row)
            continue

        print(f"  Found PDB: {pdb_path}")

        # Run FoldX analysis and parse its output. The broad except is
        # deliberate: one failing experiment must not abort the batch.
        try:
            foldx_results = run_foldx_analysis(pdb_path, exp['experiment_name'], output_dir, foldx_path)
            analysis_dir = Path(output_dir) / f"{exp['experiment_name']}_analysis"
            metrics = parse_foldx_results(analysis_dir)
        except Exception as e:
            print(f"  ERROR: {str(e)}")
            row['status'] = f'error: {str(e)}'
        else:
            # run_foldx_analysis reports failed commands in its result dict
            # rather than raising, so inspect it before claiming success.
            failed = [step for step, outcome in foldx_results.items() if outcome != 'completed']
            if failed:
                print(f"  WARNING: FoldX failed for {', '.join(failed)}")
                row['status'] = f"error: FoldX failed for {', '.join(failed)}"
            else:
                row['status'] = 'completed'
            row['foldx_results'] = foldx_results
            row.update(metrics)

        results_summary.append(row)

    # Save results summary
    results_df = pd.DataFrame(results_summary)
    results_df.to_csv(f"{output_dir}/solubility_assessment_summary.csv", index=False)
    results_df.to_excel(f"{output_dir}/solubility_assessment_summary.xlsx", index=False)

    completed_count = sum(r['status'] == 'completed' for r in results_summary)
    error_count = sum('error' in r['status'] for r in results_summary)
    missing_count = sum(r['status'] == 'PDB not found' for r in results_summary)

    print(f"\nAnalysis complete! Results saved to {output_dir}/")
    print("\nSummary:")
    print(f"  Total experiments: {len(experiments)}")
    print(f"  Completed: {completed_count}")
    print(f"  Errors: {error_count}")
    print(f"  PDB not found: {missing_count}")

# Run the pipeline only when this file is executed as the main module.
if __name__ == "__main__":
    main()

Reading experiment data from Excel file...
Found 30 experiments to process

[1/30] Processing kinetic-template...
  Found PDB: experiments/kinetic-template/4_docking/output/08_mdscoring/mdscoring_2.pdb
Running Stability analysis for mdscoring_2...
Running AnalyseComplex for mdscoring_2...
Running SequenceDetail for mdscoring_2...

[2/30] Processing partial-lagoon...
  Found PDB: experiments/partial-lagoon/4_docking/output/08_mdscoring/mdscoring_2.pdb
Running Stability analysis for mdscoring_2...
Running AnalyseComplex for mdscoring_2...
Running SequenceDetail for mdscoring_2...

[3/30] Processing glowing-avocet...
  Found PDB: experiments/glowing-avocet/4_docking/output/08_mdscoring/mdscoring_5.pdb
Running Stability analysis for mdscoring_5...
Running AnalyseComplex for mdscoring_5...
Running SequenceDetail for mdscoring_5...

[4/30] Processing approximate-entrepreneur...
  Found PDB: experiments/approximate-entrepreneur/4_docking/output/08_mdscoring/mdscoring_1.pdb
Running Stability a