In [7]:
import csv

file_path = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\protein_matching_results_counts_updated.tsv"

with open(file_path, newline='') as tsvfile:
    reader = csv.DictReader(tsvfile, delimiter='\t')
    for row in reader:
        if row['status'].strip() != 'Success':
            print(row)

{'original_id': '1W8X-M', 'matched_id': '1w8x_M', 'similarity_score': '', 'notes': 'Stage 1', 'n_a3m_files': '3', 'n_rows_dropped': '0', 'hhfilter_rows': '1', 'diversity_filtered_rows': '', 'status': 'Failed: only 1 rows after stripping'}
{'original_id': '2WWX-B', 'matched_id': '2wwx_B', 'similarity_score': '', 'notes': 'Stage 1', 'n_a3m_files': '3', 'n_rows_dropped': '0', 'hhfilter_rows': '1', 'diversity_filtered_rows': '', 'status': 'Failed: only 1 rows after stripping'}
{'original_id': '1W8X-M', 'matched_id': '1w8x_M', 'similarity_score': '', 'notes': 'Stage 1', 'n_a3m_files': '3', 'n_rows_dropped': '0', 'hhfilter_rows': '1', 'diversity_filtered_rows': '', 'status': 'Failed: only 1 rows after stripping'}
{'original_id': '2WWX-B', 'matched_id': '2wwx_B', 'similarity_score': '', 'notes': 'Stage 1', 'n_a3m_files': '3', 'n_rows_dropped': '0', 'hhfilter_rows': '1', 'diversity_filtered_rows': '', 'status': 'Failed: only 1 rows after stripping'}
{'original_id': '1W8X-M', 'matched_id': '1w8

In [11]:
#!/usr/bin/env python
"""
retry_failed_msa_update.py - Re-process failed MSA downloads with case-sensitive fix
Updates the TSV file in-place instead of appending duplicates
"""

import csv
import shutil
import subprocess
import numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Tuple
from tqdm.auto import tqdm
from scipy.spatial.distance import pdist, squareform
from datetime import datetime

import boto3
from boto3.s3.transfer import TransferConfig
from botocore.config import Config
from botocore import UNSIGNED
import botocore.exceptions

# ─── Configuration ───────────────────────────────────────────────────────
ROOT = Path(r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing")
TSV_FILE = ROOT / "data" / "protein_matching_results_counts_updated.tsv"
PROTEIN_DIR = ROOT / "data" / "protein_data_pdb"
MAX_ROWS = 256
DL_CONCURRENCY = 8
HH_PARALLEL = 6

# S3 setup
BUCKET = "openfold"
A3M_FILES = ("bfd_uniclust_hits.a3m", "mgnify_hits.a3m", "uniref90_hits.a3m")
transfer_cfg = TransferConfig(max_concurrency=DL_CONCURRENCY)
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

# ─── Helper Functions (same as before) ───────────────────────────────────
def to_wsl(p: Path) -> str:
    """Convert Windows path to WSL path"""
    posix = p.resolve().as_posix()
    return posix if posix.startswith("/mnt/") else f"/mnt/{p.drive[0].lower()}{posix[2:]}"

def strip_insertions_a3m(seq: str) -> str:
    """Remove insertions from A3M sequences"""
    return seq.translate({ord(c): None for c in 'abcdefghijklmnopqrstuvwxyz.*'})

def load_msa_from_a3m(path: Path):
    """Load MSA from A3M file"""
    msa, hdr, seq = [], None, []
    with path.open() as fh:
        for line in fh:
            line = line.rstrip()
            if line.startswith('>'):
                if hdr is not None:
                    msa.append((hdr, ''.join(seq)))
                hdr, seq = line[1:], []
            else:
                seq.append(line)
        if hdr is not None:
            msa.append((hdr, ''.join(seq)))
    return msa

def diversity_max_subsample(msa, k):
    """Diversity maximizing subsample"""
    if len(msa) <= k:
        return msa
    seqs = np.array([list(s) for _, s in msa], dtype='U1')
    uniq = {aa: i for i, aa in enumerate(sorted({c for row in seqs for c in row}))}
    arr = np.vectorize(uniq.get)(seqs)
    dist = squareform(pdist(arr, metric='hamming'))
    keep = [0]
    selected = np.zeros(len(msa), bool)
    selected[0] = True
    while selected.sum() < k:
        mean = dist[:, selected].mean(1)
        mean[selected] = -1
        idx = int(mean.argmax())
        if mean[idx] <= 0:
            break
        selected[idx] = True
        keep.append(idx)
    return [msa[i] for i in keep]

# ─── Case-Sensitive Download Function ────────────────────────────────────
def download_with_case_fix(pdb: str, chain: str, fname: str, dest: Path) -> bool:
    """
    Try different case combinations for the chain letter.
    The key insight: OpenFold stores files with the exact case from the PDB ID.
    """
    base = f"pdb/{pdb.lower()}_"
    
    # Try these in order of likelihood
    candidates = [
        f"{base}{chain.lower()}/a3m/{fname}",  # Most common: lowercase chain
        f"{base}{chain.upper()}/a3m/{fname}",  # Sometimes uppercase
        f"{base}{chain}/a3m/{fname}",          # Original case (if mixed)
    ]
    
    for i, key in enumerate(candidates):
        try:
            dest.parent.mkdir(parents=True, exist_ok=True)
            s3.download_file(BUCKET, key, str(dest), Config=transfer_cfg)
            if i > 0:  # Only print if not the first attempt
                print(f"   → Found {fname} using case variant #{i+1}: {key}")
            return True
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "404":
                continue  # Try next candidate
            # For other errors (network, permissions), propagate
            raise
    return False  # All candidates failed

# ─── Process Chain Function ──────────────────────────────────────────────
def process_chain_fixed(chain_folder: Path, row_data: Dict[str, str]) -> Dict[str, str]:
    """Process a single chain with fixed case handling"""
    try:
        # Parse the matched_id - preserve the original case!
        matched_id = row_data['matched_id']
        if '_' not in matched_id:
            return {**row_data, **{
                'n_rows_dropped': '',
                'hhfilter_rows': '',
                'diversity_filtered_rows': '',
                'status': 'Failed: invalid matched_id format'
            }}
        
        parts = matched_id.split('_')
        pdb = parts[0]
        chain = parts[1]  # Keep original case!
        
        a3m_dir = chain_folder / "_tmp_a3m"
        
        # Download with case fix
        downloaded = []
        download_count = 0
        for fname in A3M_FILES:
            dest = a3m_dir / fname
            if download_with_case_fix(pdb, chain, fname, dest):
                downloaded.append(dest)
                download_count += 1
        
        if not downloaded:
            return {**row_data, **{
                'n_rows_dropped': '',
                'hhfilter_rows': '',
                'diversity_filtered_rows': '',
                'status': 'Failed: no A3M found (all cases tried)'
            }}
        
        # Concatenate
        raw_path = a3m_dir / "concat_raw.a3m"
        with raw_path.open('w') as out:
            for f in downloaded:
                out.write(f.read_text())
        
        # HHfilter
        hh_out = a3m_dir / f"hhfiltered_{MAX_ROWS}.a3m"
        result = subprocess.run([
            "wsl", "hhfilter", "-i", to_wsl(raw_path), "-o", to_wsl(hh_out),
            "-diff", str(MAX_ROWS)
        ], capture_output=True, text=True)
        
        if result.returncode != 0:
            return {**row_data, **{
                'n_rows_dropped': '',
                'hhfilter_rows': '',
                'diversity_filtered_rows': '',
                'status': f'Failed: hhfilter error: {result.stderr[:100]}'
            }}
        
        # Load and process MSA
        msa = load_msa_from_a3m(hh_out)
        hh_rows = len(msa)
        if not msa:
            return {**row_data, **{
                'n_rows_dropped': '',
                'hhfilter_rows': str(hh_rows),
                'diversity_filtered_rows': '',
                'status': 'Failed: hhfilter empty'
            }}
        
        # Strip insertions
        target_len = len(strip_insertions_a3m(msa[0][1]))
        kept, dropped = [], 0
        for hdr, seq in msa:
            clean = strip_insertions_a3m(seq)
            if len(clean) == target_len:
                kept.append((hdr, clean))
            else:
                dropped += 1
        
        if len(kept) < 2:
            return {**row_data, **{
                'n_rows_dropped': str(dropped),
                'hhfilter_rows': str(hh_rows),
                'diversity_filtered_rows': '',
                'status': f'Failed: only {len(kept)} rows after stripping'
            }}
        
        # Diversity filtering if needed
        rows_before_div = len(kept)
        div_filtered = 0
        if rows_before_div > MAX_ROWS:
            kept = diversity_max_subsample(kept, MAX_ROWS)
            div_filtered = rows_before_div - len(kept)
        
        # Save final MSA
        final_path = chain_folder / "final_filtered_256_stripped.a3m"
        with final_path.open('w') as fh:
            for hdr, seq in kept:
                fh.write(f">{hdr}\n{seq}\n")
        
        return {**row_data, **{
            'n_rows_dropped': str(dropped),
            'hhfilter_rows': str(hh_rows),
            'diversity_filtered_rows': str(div_filtered),
            'status': 'Success',
            '_download_count': str(download_count)  # Internal use only
        }}
    
    except Exception as e:
        return {**row_data, **{
            'n_rows_dropped': '',
            'hhfilter_rows': '',
            'diversity_filtered_rows': '',
            'status': f'Failed: {str(e)[:100]}'
        }}
    
    finally:
        # Cleanup
        a3m_dir = chain_folder / "_tmp_a3m"
        if a3m_dir.exists():
            shutil.rmtree(a3m_dir, ignore_errors=True)

# ─── Main Processing ─────────────────────────────────────────────────────
def main():
    start_time = datetime.now()
    print("🔄 Retry Failed MSA Processing with Case Fix")
    print("=" * 60)
    print(f"Started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    
    # Check hhfilter
    if subprocess.run(["wsl", "which", "hhfilter"], capture_output=True).returncode:
        raise RuntimeError("hhfilter not found in WSL - please install HH-suite")
    
    # Read current TSV
    print("📖 Reading TSV file...")
    with TSV_FILE.open('r', newline='') as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        fieldnames = reader.fieldnames
        all_rows = list(reader)
    
    print(f"   Total rows: {len(all_rows)}")
    
    # Find failed rows (excluding permanent failures)
    PERMANENT_FAILURES = {'Failed: mapping missing', 'Failed: invalid chain ID'}
    failed_rows = []
    failed_indices = []
    
    for i, row in enumerate(all_rows):
        status = row.get('status', '')
        if status != 'Success' and not any(perm in status for perm in PERMANENT_FAILURES):
            failed_rows.append(row)
            failed_indices.append(i)
    
    if not failed_rows:
        print("✅ No retryable failures found!")
        return
    
    print(f"   Found {len(failed_rows)} retryable failures")
    
    # Show some examples
    print("\n📋 Sample failures to retry:")
    for row in failed_rows[:5]:
        print(f"   {row['original_id']} -> {row['matched_id']} : {row['status']}")
    if len(failed_rows) > 5:
        print(f"   ... and {len(failed_rows) - 5} more")
    
    # Count failure types
    failure_types = {}
    for row in failed_rows:
        status = row['status']
        failure_types[status] = failure_types.get(status, 0) + 1
    
    print("\n📊 Failure breakdown:")
    for status, count in sorted(failure_types.items(), key=lambda x: -x[1])[:5]:
        print(f"   {count:3d} proteins: {status}")
    
    # Process in parallel
    print(f"\n🚀 Processing {len(failed_rows)} proteins...")
    print(f"   Workers: {HH_PARALLEL}")
    print(f"   Note: Output may appear interleaved due to parallel execution")
    
    updated_rows = {}  # Use dict to maintain order
    
    with ThreadPoolExecutor(max_workers=HH_PARALLEL) as pool:
        # Submit all jobs
        futures = {}
        for i, row in enumerate(failed_rows):
            chain_folder = PROTEIN_DIR / row['original_id']
            future = pool.submit(process_chain_fixed, chain_folder, row)
            futures[future] = (i, row)  # Track index for ordering
        
        # Process results with progress bar
        print("\n" + "="*80)
        with tqdm(total=len(futures), desc="Processing", unit="protein") as pbar:
            for future in as_completed(futures):
                idx, original_row = futures[future]
                result = future.result()
                updated_rows[idx] = result  # Store with original index
                
                # Print detailed status for each protein
                old_status = original_row['status']
                new_status = result['status']
                protein_id = result['original_id']
                matched_id = result['matched_id']
                
                if new_status == 'Success':
                    download_count = result.get('_download_count', '?')
                    timestamp = datetime.now().strftime("%H:%M:%S")
                    print(f"[{timestamp}] ✅ {protein_id} ({matched_id}): FIXED! Was: '{old_status}' → Now: Success")
                    print(f"   → Downloaded {download_count}/3 A3M files")
                    print(f"   → Sequences: {result['hhfilter_rows']} → {int(result['hhfilter_rows']) - int(result['n_rows_dropped'] or 0)} after stripping")
                    if result['diversity_filtered_rows'] and int(result['diversity_filtered_rows']) > 0:
                        print(f"   → Diversity filtered: {result['diversity_filtered_rows']} sequences")
                    pbar.set_postfix_str(f"✓ {protein_id}")
                else:
                    timestamp = datetime.now().strftime("%H:%M:%S")
                    print(f"[{timestamp}] ❌ {protein_id} ({matched_id}): STILL FAILED")
                    print(f"   Old: '{old_status}'")
                    print(f"   New: '{new_status}'")
                    pbar.set_postfix_str(f"✗ {protein_id}")
                
                pbar.update(1)
        print("="*80)
    
    # Sort results back to original order
    sorted_updated_rows = [updated_rows[i] for i in sorted(updated_rows.keys())]
    
    # Update the original rows
    print("\n📝 Updating TSV file...")
    success_count = 0
    for updated_row, idx in zip(sorted_updated_rows, failed_indices):
        # Remove internal fields before saving (make a copy)
        clean_row = {k: v for k, v in updated_row.items() if not k.startswith('_')}
        all_rows[idx] = clean_row
        if clean_row['status'] == 'Success':
            success_count += 1
    
    # Write updated TSV (backup original first)
    backup_path = TSV_FILE.with_suffix('.tsv.bak')
    shutil.copy2(TSV_FILE, backup_path)
    print(f"   Backed up to: {backup_path}")
    
    with TSV_FILE.open('w', newline='') as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(all_rows)
    
    print(f"   Updated TSV saved!")
    
    # Summary
    print("\n📊 Summary:")
    print(f"   Total retried: {len(failed_rows)}")
    print(f"   Now successful: {success_count}")
    print(f"   Still failed: {len(failed_rows) - success_count}")
    print(f"   Success rate: {success_count/len(failed_rows)*100:.1f}%")
    
    # Show remaining failures
    still_failed = [r for r in sorted_updated_rows if r['status'] != 'Success']
    if still_failed:
        print("\n⚠️  Remaining failures:")
        failure_types = {}
        for row in still_failed:
            status = row['status']
            failure_types[status] = failure_types.get(status, 0) + 1
        
        for status, count in sorted(failure_types.items(), key=lambda x: -x[1]):
            print(f"   {count:3d} proteins: {status}")
    
    # Print timing
    end_time = datetime.now()
    duration = end_time - start_time
    print(f"\n⏱️  Total time: {duration}")
    print(f"Completed at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")

if __name__ == "__main__":
    main()

🔄 Retry Failed MSA Processing with Case Fix
Started at: 2025-06-13 11:11:51
📖 Reading TSV file...
   Total rows: 29744
   Found 327 retryable failures

📋 Sample failures to retry:
   1W8X-M -> 1w8x_M : Failed: only 1 rows after stripping
   2WWX-B -> 2wwx_B : Failed: only 1 rows after stripping
   1W8X-M -> 1w8x_M : Failed: only 1 rows after stripping
   2WWX-B -> 2wwx_B : Failed: only 1 rows after stripping
   1W8X-M -> 1w8x_M : Failed: only 1 rows after stripping
   ... and 322 more

📊 Failure breakdown:
   318 proteins: Failed: no A3M found
     9 proteins: Failed: only 1 rows after stripping

🚀 Processing 327 proteins...
   Workers: 6
   Note: Output may appear interleaved due to parallel execution



Processing:   0%|          | 0/327 [00:00<?, ?protein/s]

   → Found bfd_uniclust_hits.a3m using case variant #2: pdb/1w8x_M/a3m/bfd_uniclust_hits.a3m
   → Found bfd_uniclust_hits.a3m using case variant #2: pdb/1w8x_M/a3m/bfd_uniclust_hits.a3m
   → Found bfd_uniclust_hits.a3m using case variant #2: pdb/2wwx_B/a3m/bfd_uniclust_hits.a3m
   → Found bfd_uniclust_hits.a3m using case variant #2: pdb/1w8x_M/a3m/bfd_uniclust_hits.a3m
[11:11:59] ❌ 2WWX-B (2wwx_B): STILL FAILED
   Old: 'Failed: only 1 rows after stripping'
   New: 'Failed: [WinError 183] Cannot create a file when that file already exists: 'C:\\Users\\rfrjo\\Documents\\Cod'
[11:11:59] ❌ 2WWX-B (2wwx_B): STILL FAILED
   Old: 'Failed: only 1 rows after stripping'
   New: 'Failed: [WinError 183] Cannot create a file when that file already exists: 'C:\\Users\\rfrjo\\Documents\\Cod'
   → Found mgnify_hits.a3m using case variant #2: pdb/1w8x_M/a3m/mgnify_hits.a3m
   → Found mgnify_hits.a3m using case variant #2: pdb/1w8x_M/a3m/mgnify_hits.a3m
   → Found mgnify_hits.a3m using case variant #2: 

In [2]:
# ===== INPUT CONFIGURATION =====
# Change this protein ID to process different proteins
protein_input = "5gup_6"  # Example: "3j9m_m", "5opt_h", etc.

#!/usr/bin/env python3
"""
Simple OpenFold MSA Processor - Jupyter Notebook Version
Just change the protein_input variable above and run all cells!
"""

import sys
import boto3
from botocore.config import Config
from botocore import UNSIGNED
from pathlib import Path
import subprocess
import numpy as np
from scipy.spatial.distance import pdist, squareform

# ===== CONFIGURATION =====
SAVE_ROOT = Path(r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\openfold_msas")
MAX_ROWS = 256
GENERATE_ESM_EMBED = False  # Set to True if you want ESM embeddings

# ===== HELPER FUNCTIONS =====
def to_wsl(p: Path) -> str:
    """Convert Windows path to WSL path"""
    p = Path(p).resolve()
    posix = p.as_posix()
    if posix.startswith("/mnt/"):
        return posix
    return f"/mnt/{p.drive[0].lower()}{posix[2:]}"

def strip_insertions_a3m(seq: str) -> str:
    """Remove insertions from A3M format sequences"""
    cleaned = ""
    for char in seq:
        if char.isupper() or char == '-':
            cleaned += char
        elif char.islower() or char == '.' or char == '*':
            continue
        else:
            print(f"Warning: unexpected character '{char}' in sequence")
    return cleaned

def load_msa_from_a3m(a3m_path, max_sequences=None):
    """Load MSA from A3M file"""
    msa_sequences = []
    with open(a3m_path) as f:
        description = None
        sequence = ""
        
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if description is not None:
                    msa_sequences.append((description, sequence))
                    if max_sequences and len(msa_sequences) >= max_sequences:
                        break
                description = line[1:]
                sequence = ""
            else:
                sequence += line
        
        if description is not None:
            msa_sequences.append((description, sequence))
    
    return msa_sequences

def diversity_maximizing_subsample(sequences, target_count):
    """Greedy Max-Hamming subsampling"""
    n = len(sequences)
    if n <= target_count:
        return sequences
    
    # Convert sequences to numeric array
    amino_acids = set()
    for _, seq in sequences:
        amino_acids.update(seq)
    
    aa_to_num = {aa: i for i, aa in enumerate(sorted(amino_acids))}
    
    seq_length = len(sequences[0][1])
    arr = np.zeros((n, seq_length), dtype=np.int8)
    
    for i, (_, seq) in enumerate(sequences):
        for j, aa in enumerate(seq):
            arr[i, j] = aa_to_num[aa]
    
    # Pair-wise Hamming distances
    dmat = squareform(pdist(arr, metric="hamming"))
    
    # Greedy selection
    selected_mask = np.zeros(n, dtype=bool)
    selected_mask[0] = True
    selected = [0]
    
    while selected_mask.sum() < target_count:
        avg = dmat[:, selected_mask].mean(axis=1)
        avg[selected_mask] = -1
        idx = int(avg.argmax())
        if avg[idx] <= 0:
            break
        selected_mask[idx] = True
        selected.append(idx)
    
    return [sequences[i] for i in selected]

# ===== MAIN PROCESSING FUNCTION =====
def process_protein(protein_id):
    """Process a single protein ID"""
    
    # Parse the protein ID - expecting format like "3j9m_m" or "5opt_h"
    if '_' not in protein_id:
        raise ValueError(f"Invalid protein ID format: {protein_id}. Expected format: pdbid_chain (e.g., 3j9m_m)")
    
    parts = protein_id.split('_')
    if len(parts) != 2:
        raise ValueError(f"Invalid protein ID format: {protein_id}. Expected format: pdbid_chain (e.g., 3j9m_m)")
    
    pdb_id = parts[0].lower()  # PDB ID should be lowercase
    chain_id = parts[1]  # Keep chain ID as-is (preserve case!)
    
    print(f"\n🧬 Processing protein: {protein_id}")
    print(f"   PDB ID: {pdb_id}")
    print(f"   Chain: {chain_id}")
    
    # Create output directory
    output_dir = SAVE_ROOT / f"{pdb_id.upper()}-{chain_id.upper()}"
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"   Output directory: {output_dir}")
    
    # Setup S3 client
    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    bucket = "openfold"
    
    # Download MSA files - IMPORTANT: Use the exact case from protein_id!
    prefix = f"pdb/{pdb_id}_{chain_id}/a3m/"  # This preserves the original case
    
    print(f"\n📥 Downloading MSA files from S3...")
    print(f"   S3 prefix: {prefix}")
    
    msa_files = []
    for fname in ("bfd_uniclust_hits.a3m", "mgnify_hits.a3m", "uniref90_hits.a3m"):
        key = prefix + fname
        dest = output_dir / fname
        try:
            print(f"   Downloading {key}...", end="")
            s3.download_file(bucket, key, str(dest))
            msa_files.append(dest)
            print(" ✓")
        except Exception as e:
            print(f" ✗ ({str(e)[:50]}...)")
    
    if not msa_files:
        raise RuntimeError(f"No MSA files found for {protein_id}")
    
    print(f"   Successfully downloaded {len(msa_files)} MSA files")
    
    # Concatenate MSA files
    raw_cat = output_dir / "all_raw.a3m"
    print(f"\n📋 Concatenating MSA files...")
    with open(raw_cat, "w") as out:
        for f in msa_files:
            out.write(f.read_text())
    print(f"   ✓ Concatenated MSA saved to: {raw_cat}")
    
    # Filter with hhfilter
    filtered = output_dir / f"all_hhfiltered_{MAX_ROWS}.a3m"
    print(f"\n🔧 Running hhfilter...")
    
    # Check if hhfilter is available
    check_hhfilter = subprocess.run(["wsl", "which", "hhfilter"], capture_output=True, text=True)
    if check_hhfilter.returncode != 0:
        raise RuntimeError("hhfilter not found in WSL. Please install HH-suite.")
    
    subprocess.run(
        [
            "wsl", "hhfilter",
            "-i", to_wsl(raw_cat),
            "-o", to_wsl(filtered),
            "-diff", str(MAX_ROWS)
        ],
        check=True,
        capture_output=True
    )
    print(f"   ✓ Filtered MSA saved to: {filtered}")
    
    # Load and process sequences
    print(f"\n📊 Processing sequences...")
    all_sequences = load_msa_from_a3m(filtered)
    print(f"   Loaded {len(all_sequences)} sequences from hhfilter output")
    
    if not all_sequences:
        raise RuntimeError("No sequences found in filtered A3M file")
    
    # Strip insertions and ensure equal length
    processed_sequences = []
    query_seq_stripped = None
    target_length = None
    
    for i, (description, sequence) in enumerate(all_sequences):
        stripped_seq = strip_insertions_a3m(sequence)
        
        if i == 0:  # First sequence is the query
            query_seq_stripped = stripped_seq
            target_length = len(stripped_seq)
            print(f"   Query sequence length: {target_length}")
        
        if len(stripped_seq) == target_length:
            processed_sequences.append((description, stripped_seq))
    
    print(f"   After length filtering: {len(processed_sequences)} sequences")
    
    if len(processed_sequences) < 2:
        raise RuntimeError(f"Not enough valid sequences after filtering: {len(processed_sequences)}")
    
    # Apply diversity maximizing if needed
    if len(processed_sequences) > MAX_ROWS:
        print(f"   Applying diversity maximizing: {len(processed_sequences)} → {MAX_ROWS}")
        processed_sequences = diversity_maximizing_subsample(processed_sequences, MAX_ROWS)
    
    print(f"\n✅ Final MSA:")
    print(f"   Sequences: {len(processed_sequences)}")
    print(f"   Length: {len(processed_sequences[0][1])}")
    
    # Save final MSA
    final_msa = output_dir / f"final_msa_{MAX_ROWS}.a3m"
    with open(final_msa, "w") as f:
        for desc, seq in processed_sequences:
            f.write(f">{desc}\n{seq}\n")
    print(f"   Saved to: {final_msa}")
    
    # Generate ESM embeddings if requested
    if GENERATE_ESM_EMBED:
        print(f"\n🧠 Generating ESM embeddings...")
        try:
            import torch
            import esm
            
            model, alphabet = esm.pretrained.esm_msa1b_t12_100M_UR50S()
            batch_converter = alphabet.get_batch_converter()
            model.eval()
            
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model = model.to(device)
            print(f"   Using device: {device}")
            
            # Prepare MSA for ESM
            msa_tuples = [(f"seq_{i}", seq) for i, (_, seq) in enumerate(processed_sequences)]
            labels, strs, tokens = batch_converter([msa_tuples])
            tokens = tokens.to(device)
            
            # Run inference
            with torch.no_grad():
                results = model(tokens, repr_layers=[12])
                msa_embeddings = results["representations"][12].cpu()
            
            # Save embeddings
            embeddings_path = output_dir / f"{pdb_id}_{chain_id}_msa_emb.pt"
            torch.save(msa_embeddings, embeddings_path)
            print(f"   ✓ Embeddings saved to: {embeddings_path}")
            
        except ImportError:
            print("   ⚠️  ESM not installed - skipping embeddings")
        except Exception as e:
            print(f"   ❌ Error generating embeddings: {e}")
    
    print(f"\n🎉 Successfully processed {protein_id}!")
    print(f"   All files saved to: {output_dir}")
    
    return output_dir

# ===== RUN THE PROCESSING =====
# This will run automatically when you execute the cell
try:
    print(f"🚀 Starting processing for protein: {protein_input}")
    output_dir = process_protein(protein_input)
    print(f"\n✅ Success! Files saved to: {output_dir}")
except Exception as e:
    print(f"\n❌ Error: {e}")
    import traceback
    traceback.print_exc()

🚀 Starting processing for protein: 5gup_6

🧬 Processing protein: 5gup_6
   PDB ID: 5gup
   Chain: 6
   Output directory: C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\openfold_msas\5GUP-6

📥 Downloading MSA files from S3...
   S3 prefix: pdb/5gup_6/a3m/
   Downloading pdb/5gup_6/a3m/bfd_uniclust_hits.a3m... ✓
   Downloading pdb/5gup_6/a3m/mgnify_hits.a3m... ✓
   Downloading pdb/5gup_6/a3m/uniref90_hits.a3m... ✓
   Successfully downloaded 3 MSA files

📋 Concatenating MSA files...
   ✓ Concatenated MSA saved to: C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\openfold_msas\5GUP-6\all_raw.a3m

🔧 Running hhfilter...
   ✓ Filtered MSA saved to: C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\openfold_msas\5GUP-6\all_hhfiltered_256.a3m

📊 Processing sequences...
   Loaded 638 sequences from hhfilter output
   Query sequence length: 453
   After length filtering: 638 sequences
   Applying diversity maximizing: 638 → 256

✅ Final MSA:
   Sequences: 256
   Length: 453
   Saved to:

In [16]:
# %% ---------------------------------------------------------------------
# Sanity-check: query vs reference identity for every processed protein
# ------------------------------------------------------------------------
from pathlib import Path
from tqdm.auto import tqdm
import textwrap

# ─── CONFIG ─────────────────────────────────────────────────────────────
ROOT = Path(r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing")
PROTEIN_DIR = ROOT / "data" / "protein_data_pdb"

SKIP_IDS = {"2WWX-B", "3M7G-A", "5GAO-E", "5OXE-A", "1W8X-M"}

A3M_NAME = "final_filtered_256_stripped.a3m"
SEQ_TXT  = "sequence.txt"

PASS_THRESHOLD = 95.0        # ≥ 95 % identity → success

# %% ---------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------------
def read_first_seq_from_a3m(path: Path) -> str:
    """
    Return the first sequence block in an A3M file (query row).
    Ignores lowercase insertions and gaps ('-') by design.
    """
    with path.open() as fh:
        for line in fh:
            if line.startswith(">"):
                break          # header found – next lines are residues
        residues = []
        for line in fh:
            if line.startswith(">"):
                break          # next header → stop
            residues.append(line.strip())
    if not residues:
        raise ValueError("No residues found")
    seq = "".join(residues)
    # The A3M is already stripped, so any leftover lowercase/gap chars
    return seq.replace("-", "").replace(".", "").upper()

def read_sequence_txt(path: Path) -> str:
    with path.open() as fh:
        return fh.read().strip().upper()

def seq_identity(seq1: str, seq2: str) -> float:
    """Simple global identity on the *shorter* length if unequal."""
    L = min(len(seq1), len(seq2))
    if L == 0:
        return 0.0
    matches = sum(a == b for a, b in zip(seq1[:L], seq2[:L]))
    return matches / L * 100.0

# %% ---------------------------------------------------------------------
# Scan folders, compute identities
# ------------------------------------------------------------------------
all_folders   = sorted(p for p in PROTEIN_DIR.iterdir() if p.is_dir())
folders_to_do = [p for p in all_folders if p.name not in SKIP_IDS]

results = []           # keep (id, identity, status, msg) for summary
errors  = []

print(f"Checking {len(folders_to_do):,} proteins (skipping {len(SKIP_IDS)})…\n")

for folder in tqdm(folders_to_do, unit="protein"):
    pid = folder.name
    try:
        a3m_path = folder / A3M_NAME
        ref_path = folder / SEQ_TXT

        if not a3m_path.exists():
            raise FileNotFoundError(f"{A3M_NAME} missing")
        if not ref_path.exists():
            raise FileNotFoundError(f"{SEQ_TXT} missing")

        query_seq = read_first_seq_from_a3m(a3m_path)
        ref_seq   = read_sequence_txt(ref_path)
        ident     = seq_identity(query_seq, ref_seq)

        status = "PASS" if ident >= PASS_THRESHOLD else "FAIL"
        results.append((pid, ident, status, ""))

    except Exception as e:
        # Record as failure with explanatory message
        results.append((pid, 0.0, "ERROR", str(e)))
        errors.append((pid, str(e)))

# %% ---------------------------------------------------------------------
# Report
# ------------------------------------------------------------------------
n = len(results)
n_pass  = sum(1 for r in results if r[2] == "PASS")
n_fail  = sum(1 for r in results if r[2] == "FAIL")
n_error = sum(1 for r in results if r[2] == "ERROR")

print(textwrap.dedent(f"""
────────────────────────────────────────────────────────────────────────
SUMMARY
────────────────────────────────────────────────────────────────────────
 total checked : {n}
   PASS (≥ {PASS_THRESHOLD:.1f} %) : {n_pass}
   FAIL (< {PASS_THRESHOLD:.1f} %) : {n_fail}
   ERROR (I/O etc.) : {n_error}
 success rate  : {n_pass/n*100:5.1f} %
────────────────────────────────────────────────────────────────────────
"""))

if n_fail:
    print("❌  Proteins below threshold:")
    for pid, ident, _, _ in sorted(r for r in results if r[2] == "FAIL",
                                   key=lambda x: x[1]):
        print(f"   {pid:<10}  {ident:6.2f}%")

if n_error:
    print("\n⚠️  Errors:")
    for pid, msg in errors:
        print(f"   {pid:<10}  {msg}")


SyntaxError: Generator expression must be parenthesized (195751666.py, line 110)

In [None]:
# %% ---------------------------------------------------------------------
# CONFIG – change paths or limits here
# ------------------------------------------------------------------------
from pathlib import Path
import csv, os, re, subprocess, tempfile
from collections import defaultdict
import boto3
from botocore.config import Config
from botocore import UNSIGNED
from tqdm.auto import tqdm
import numpy as np
from scipy.spatial.distance import pdist, squareform

ROOT = Path(r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing")
PROTEIN_DIR = ROOT / "data" / "protein_data_pdb"
MATCH_FILE  = ROOT / "data" / "protein_matching_results_counts.tsv"
UPDATED_OUT = ROOT / "data" / "protein_matching_results_counts_updated.tsv"

MAX_ROWS    = 256          # target sequences in final MSA
MAX_CHAINS  = 2         # e.g. 10 → process first 10 chains, None → all
BUCKET      = "openfold"
A3M_FILES   = ("bfd_uniclust_hits.a3m", "mgnify_hits.a3m", "uniref90_hits.a3m")

# ------------------------------------------------------------------------
# HELPERS
# ------------------------------------------------------------------------
def parse_chain_id(s: str):
    s = s.strip()
    if '-' in s:
        pdb, chain = s.split('-')
    elif '_' in s:
        pdb, chain = s.split('_')
    elif re.fullmatch(r"[0-9][A-Za-z0-9]{3}[A-Za-z]", s):
        pdb, chain = s[:4], s[4]
    else:
        raise ValueError(f"Cannot parse chain ID from '{s}'")
    return pdb.upper(), chain.upper()

def to_wsl(p: Path) -> str:
    posix = p.resolve().as_posix()
    return posix if posix.startswith("/mnt/") else f"/mnt/{p.drive[0].lower()}{posix[2:]}"

LOWER = ''.join(chr(c) for c in range(97,123))  # ascii_lowercase
def strip_insertions_a3m(seq: str) -> str:
    """Remove lowercase insertions, '.', '*'."""
    return seq.translate({ord(c): None for c in LOWER + ".*"})

def load_msa_from_a3m(path):
    msa, hdr, seq = [], None, []
    with open(path) as fh:
        for line in fh:
            line = line.rstrip()
            if line.startswith('>'):
                if hdr is not None:
                    msa.append((hdr, ''.join(seq)))
                hdr, seq = line[1:], []
            else:
                seq.append(line)
        if hdr is not None:
            msa.append((hdr, ''.join(seq)))
    return msa

def diversity_max_subsample(msa, k):
    """Greedy max-Hamming subsample to k rows (msa = [(hdr, seq), …])."""
    if len(msa) <= k:
        return msa
    seqs = np.array([list(s) for _, s in msa], dtype='U1')
    uniq = {aa:i for i, aa in enumerate(sorted({c for row in seqs for c in row}))}
    arr  = np.vectorize(uniq.get)(seqs)
    dist = squareform(pdist(arr, metric='hamming'))
    keep = [0]                              # always keep query row
    selected = np.zeros(len(msa), bool); selected[0] = True
    while selected.sum() < k:
        mean = dist[:, selected].mean(1)
        mean[selected] = -1
        idx = int(mean.argmax())
        if mean[idx] <= 0: break
        selected[idx] = True
        keep.append(idx)
    return [msa[i] for i in keep]

# ------------------------------------------------------------------------
# S3 client (anonymous)
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

# ----------------- read mapping once ------------------------------------
with open(MATCH_FILE, newline='') as fh:
    reader = csv.DictReader(fh, delimiter='\t')
    mapping = {row['original_id'].strip(): row for row in reader}

# sanity: hhfilter ?
if subprocess.run(["wsl", "which", "hhfilter"], capture_output=True).returncode:
    raise RuntimeError("hhfilter not found inside WSL; install HH-suite first")

# ------------------------------------------------------------------------
# MAIN LOOP  –  with a single progress bar
# ------------------------------------------------------------------------
results = []
chain_folders = sorted([p for p in PROTEIN_DIR.iterdir() if p.is_dir()])
if MAX_CHAINS is not None:
    chain_folders = chain_folders[:MAX_CHAINS]

outer_pbar = tqdm(chain_folders, desc="Chains", unit="chain")

for chain_folder in outer_pbar:
    chain_name = chain_folder.name          # e.g. 1A0C-A
    outer_pbar.set_postfix_str(chain_name)

    row_base = mapping.get(chain_name)
    if row_base is None:
        results.append({'original_id': chain_name,
                        'matched_id':'',
                        'n_rows_dropped':'',
                        'hhfilter_rows': '',
                        'diversity_filtered_rows': '',
                        'status':'Failed: mapping missing'})
        continue

    matched_id = row_base['matched_id'].replace('_','-')
    pdb, chain = parse_chain_id(matched_id)
    prefix = f"pdb/{pdb.lower()}_{chain}/a3m/"
    
    tmp_dir = tempfile.TemporaryDirectory()
    tmp_path = Path(tmp_dir.name)

    # download
    downloaded = []
    for fname in A3M_FILES:
        key  = prefix + fname
        dest = tmp_path / fname
        try:
            s3.download_file(BUCKET, key, str(dest))
            downloaded.append(dest)
        except Exception: 
            pass

    if not downloaded:
        results.append({**row_base, 'n_rows_dropped':'',
                        'hhfilter_rows': '', 'diversity_filtered_rows': '',
                        'status':'Failed: no A3M found'})
        tmp_dir.cleanup()
        continue

    # concat into a temporary file
    raw_path = tmp_path / "concat_raw.a3m"
    with open(raw_path, 'w') as out:
        for f in downloaded:
            with open(f) as infile:
                out.write(infile.read())

    # hhfilter
    hh_out = tmp_path / f"hhfiltered_{MAX_ROWS}.a3m"
    subprocess.run(["wsl", "hhfilter",
                    "-i", to_wsl(raw_path),
                    "-o", to_wsl(hh_out),
                    "-diff", str(MAX_ROWS)],
                   check=True, capture_output=True)

    msa = load_msa_from_a3m(hh_out)
    hhfilter_rows = len(msa)
    
    if not msa:
        results.append({**row_base,'n_rows_dropped':'',
                        'hhfilter_rows': hhfilter_rows, 'diversity_filtered_rows': '',
                        'status':'Failed: hhfilter empty'})
        tmp_dir.cleanup()
        continue

    # strip insertions
    target_len = len(strip_insertions_a3m(msa[0][1]))
    kept, dropped = [], 0

    for hdr, seq in msa:
        clean = strip_insertions_a3m(seq)
        if len(clean) == target_len:
            kept.append((hdr, clean))
        else:
            dropped += 1

    if len(kept) < 2:
        results.append({**row_base,
                        'n_rows_dropped': dropped,
                        'hhfilter_rows': hhfilter_rows,
                        'diversity_filtered_rows': '',
                        'status': f'Failed: only {len(kept)} rows after stripping'})
        tmp_dir.cleanup()
        continue
    
    rows_before_diversity_filter = len(kept)
    diversity_filtered_rows = 0
    if rows_before_diversity_filter > MAX_ROWS:
        kept = diversity_max_subsample(kept, MAX_ROWS)
        diversity_filtered_rows = rows_before_diversity_filter - len(kept)

    # save final A3M
    final_path = chain_folder / "final_filtered_256_stripped.a3m"
    with open(final_path, 'w') as fh:
        for hdr, seq in kept:
            fh.write(f">{hdr}\n{seq}\n")

    results.append({**row_base,
                    'n_rows_dropped': dropped,
                    'hhfilter_rows': hhfilter_rows,
                    'diversity_filtered_rows': diversity_filtered_rows,
                    'status':'Success'})
    tmp_dir.cleanup()

# ------------------------------------------------------------------------
# WRITE UPDATED TSV
# ------------------------------------------------------------------------
orig_cols  = list(next(iter(mapping.values())).keys())
extra_cols = ['n_rows_dropped', 'hhfilter_rows', 'diversity_filtered_rows', 'status']
fieldnames = orig_cols + [c for c in extra_cols if c not in orig_cols]

with open(UPDATED_OUT, 'w', newline='') as fh:
    writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()
    for row in results:
        writer.writerow(row)

print(f"\n✅ Finished.  Log → {UPDATED_OUT}")

Chains:   0%|          | 0/2 [00:00<?, ?chain/s]

DL 154L-A:   0%|          | 0/3 [00:00<?, ?file/s]

Strip 154L-A:   0%|          | 0/434 [00:00<?, ?it/s]

DL 155C-A:   0%|          | 0/3 [00:00<?, ?file/s]

Strip 155C-A:   0%|          | 0/323 [00:00<?, ?it/s]


✅ Finished.  Log → C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\protein_matching_results_counts_updated.tsv


In [2]:
# batch_msa_pipeline.py – identical biological logic, batched I/O with RESUMABILITY
# -----------------------------------------------------------------------------
# 1.  Grabs A3M files in batches of BATCH_SIZE chains.
# 2.  Downloads each batch concurrently (up to DL_CONCURRENCY workers).
# 3.  Runs hhfilter / strip / diversity on each chain (HH_PARALLEL chains in
#     parallel so you do not oversubscribe CPU).
# 4.  SAVES PROGRESS after each batch - can be interrupted and resumed!
#
# Expected runtime on a 1 Gbit link & 16‑core CPU: 25 h → 4‑5 h end‑to‑end.
# -----------------------------------------------------------------------------

from __future__ import annotations

import csv, os, re, shutil, subprocess, tempfile, itertools, math
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple, Dict, Set

import boto3
from boto3.s3.transfer import TransferConfig
from botocore.config import Config
from botocore import UNSIGNED

import numpy as np
from scipy.spatial.distance import pdist, squareform
from tqdm.auto import tqdm

# ─── CONFIG ──────────────────────────────────────────────────────────────
ROOT          = Path(r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing")
PROTEIN_DIR   = ROOT / "data" / "protein_data_pdb"
MATCH_FILE    = ROOT / "data" / "protein_matching_results_counts.tsv"
UPDATED_OUT   = ROOT / "data" / "protein_matching_results_counts_updated.tsv"

MAX_ROWS      = 256          # target sequences post‑hhfilter
MAX_CHAINS    = None         # set to small int for quick tests, None for all

# batching / concurrency
BATCH_SIZE       = 50        # chains per network batch
DL_CONCURRENCY   = 14         # simultaneous S3 downloads 24,10
HH_PARALLEL      = 10          # chains processed in parallel (CPU‑bound)

BUCKET        = "openfold"
A3M_FILES     = ("bfd_uniclust_hits.a3m", "mgnify_hits.a3m", "uniref90_hits.a3m")

# ─── HELPERS ─────────────────────────────────────────────────────────────
LOWER = ''.join(chr(c) for c in range(97, 123))


def parse_chain_id(s: str) -> Tuple[str, str]:
    s = s.strip()
    if '-' in s:
        pdb, chain = s.split('-')
    elif '_' in s:
        pdb, chain = s.split('_')
    elif re.fullmatch(r"[0-9][A-Za-z0-9]{3}[A-Za-z]", s):
        pdb, chain = s[:4], s[4]
    else:
        raise ValueError(f"Cannot parse chain ID from '{s}'")
    return pdb.upper(), chain.upper()


def to_wsl(p: Path) -> str:
    posix = p.resolve().as_posix()
    return posix if posix.startswith("/mnt/") else f"/mnt/{p.drive[0].lower()}{posix[2:]}"


def strip_insertions_a3m(seq: str) -> str:
    return seq.translate({ord(c): None for c in LOWER + ".*"})


def load_msa_from_a3m(path: Path):
    msa, hdr, seq = [], None, []
    with path.open() as fh:
        for line in fh:
            line = line.rstrip()
            if line.startswith('>'):
                if hdr is not None:
                    msa.append((hdr, ''.join(seq)))
                hdr, seq = line[1:], []
            else:
                seq.append(line)
        if hdr is not None:
            msa.append((hdr, ''.join(seq)))
    return msa


def diversity_max_subsample(msa, k):
    if len(msa) <= k:
        return msa
    seqs = np.array([list(s) for _, s in msa], dtype='U1')
    uniq = {aa: i for i, aa in enumerate(sorted({c for row in seqs for c in row}))}
    arr = np.vectorize(uniq.get)(seqs)
    dist = squareform(pdist(arr, metric='hamming'))
    keep = [0]
    selected = np.zeros(len(msa), bool); selected[0] = True
    while selected.sum() < k:
        mean = dist[:, selected].mean(1)
        mean[selected] = -1
        idx = int(mean.argmax())
        if mean[idx] <= 0:
            break
        selected[idx] = True
        keep.append(idx)
    return [msa[i] for i in keep]

# ─── RESUMABILITY FUNCTIONS ──────────────────────────────────────────────

def load_existing_results() -> Tuple[Set[str], List[str]]:
    """Load already processed proteins from existing TSV file."""
    if not UPDATED_OUT.exists():
        return set(), []
    
    processed = set()
    fieldnames = []
    
    # Define permanent failures that shouldn't be retried
    PERMANENT_FAILURES = {
        'Failed: mapping missing',
        'Failed: invalid chain ID'
    }
    
    try:
        with UPDATED_OUT.open('r', newline='') as fh:
            reader = csv.DictReader(fh, delimiter='\t')
            fieldnames = reader.fieldnames or []
            successful = 0
            permanent_fails = 0
            retryable_fails = 0
            
            for row in reader:
                protein_id = row['original_id'].strip()
                status = row.get('status', '')
                
                if status == 'Success':
                    processed.add(protein_id)
                    successful += 1
                elif any(perm_fail in status for perm_fail in PERMANENT_FAILURES):
                    # Don't retry permanent failures
                    processed.add(protein_id)
                    permanent_fails += 1
                else:
                    # Retry temporary failures (network issues, etc.)
                    retryable_fails += 1
        
        print(f"📊 Found existing results:")
        print(f"   ✅ Successful: {successful}")
        print(f"   ❌ Permanent failures (skipped): {permanent_fails}")
        print(f"   🔄 Retryable failures: {retryable_fails}")
        print(f"   📋 Total to skip: {len(processed)}")
        
        return processed, fieldnames
        
    except Exception as e:
        print(f"⚠️  Error reading existing results: {e}")
        return set(), []


def save_batch_results(batch_results: List[Dict[str, str]], fieldnames: List[str], is_first_batch: bool = False):
    """Append batch results to TSV file."""
    mode = 'w' if is_first_batch else 'a'
    
    with UPDATED_OUT.open(mode, newline='') as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t')
        if is_first_batch:
            writer.writeheader()
        writer.writerows(batch_results)


def initialize_fieldnames(mapping: Dict[str, Dict[str, str]]) -> List[str]:
    """Initialize fieldnames for TSV output."""
    orig_cols = list(next(iter(mapping.values())).keys())
    extra_cols = ['n_rows_dropped', 'hhfilter_rows', 'diversity_filtered_rows', 'status']
    return orig_cols + [c for c in extra_cols if c not in orig_cols]

# ─── AWS S3 CLIENT ───────────────────────────────────────────────────────
transfer_cfg = TransferConfig(max_concurrency=DL_CONCURRENCY)
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

# ─── READ MAPPING ONCE ───────────────────────────────────────────────────
with open(MATCH_FILE, newline='') as fh:
    mapping: Dict[str, Dict[str, str]] = {
        row['original_id'].strip(): row for row in csv.DictReader(fh, delimiter='\t')
    }

# sanity: hhfilter available?
if subprocess.run(["wsl", "which", "hhfilter"], capture_output=True).returncode:
    raise RuntimeError("hhfilter not found inside WSL; install HH-suite first")

# ─── MAIN FUNCTIONS ──────────────────────────────────────────────────────

def download_one(key: str, dest: Path) -> bool:
    try:
        dest.parent.mkdir(parents=True, exist_ok=True)
        s3.download_file(BUCKET, key, str(dest), Config=transfer_cfg)
        return True
    except Exception:
        return False


def process_chain(chain_folder: Path, row_base: Dict[str, str]) -> Dict[str, str]:
    """Runs hhfilter→strip→diversity on one chain. Expects A3M files already local."""
    try:
        matched_id = row_base['matched_id'].replace('_', '-')
        pdb, chain = parse_chain_id(matched_id)
        a3m_dir = chain_folder / "_tmp_a3m"  # local staging dir inside chain folder

        downloaded = [(a3m_dir / f) for f in A3M_FILES if (a3m_dir / f).exists()]
        if not downloaded:
            return {**row_base, **{
                'n_rows_dropped': '',
                'hhfilter_rows': '',
                'diversity_filtered_rows': '',
                'status': 'Failed: no A3M found'
            }}

        # concat → hhfilter
        raw_path = a3m_dir / "concat_raw.a3m"
        with raw_path.open('w') as out:
            for f in downloaded:
                out.write(f.read_text())

        hh_out = a3m_dir / f"hhfiltered_{MAX_ROWS}.a3m"
        subprocess.run([
            "wsl", "hhfilter", "-i", to_wsl(raw_path), "-o", to_wsl(hh_out),
            "-diff", str(MAX_ROWS)
        ], check=True, capture_output=True)

        msa = load_msa_from_a3m(hh_out)
        hh_rows = len(msa)
        if not msa:
            return {**row_base, **{
                'n_rows_dropped': '',
                'hhfilter_rows': hh_rows,
                'diversity_filtered_rows': '',
                'status': 'Failed: hhfilter empty'
            }}

        target_len = len(strip_insertions_a3m(msa[0][1]))
        kept, dropped = [], 0
        for hdr, seq in msa:
            clean = strip_insertions_a3m(seq)
            if len(clean) == target_len:
                kept.append((hdr, clean))
            else:
                dropped += 1
        if len(kept) < 2:
            return {**row_base, **{
                'n_rows_dropped': dropped,
                'hhfilter_rows': hh_rows,
                'diversity_filtered_rows': '',
                'status': f'Failed: only {len(kept)} rows after stripping'
            }}

        rows_before_div = len(kept)
        div_filtered = 0
        if rows_before_div > MAX_ROWS:
            kept = diversity_max_subsample(kept, MAX_ROWS)
            div_filtered = rows_before_div - len(kept)

        final_path = chain_folder / "final_filtered_256_stripped.a3m"
        with final_path.open('w') as fh:
            for hdr, seq in kept:
                fh.write(f">{hdr}\n{seq}\n")

        return {**row_base, **{
            'n_rows_dropped': dropped,
            'hhfilter_rows': hh_rows,
            'diversity_filtered_rows': div_filtered,
            'status': 'Success'
        }}
    
    except Exception as e:
        return {**row_base, **{
            'n_rows_dropped': '',
            'hhfilter_rows': '',
            'diversity_filtered_rows': '',
            'status': f'Failed: {str(e)[:100]}'
        }}
    
    finally:
        # cleanup tmp dir
        a3m_dir = chain_folder / "_tmp_a3m"
        if a3m_dir.exists():
            shutil.rmtree(a3m_dir, ignore_errors=True)


# ─── BATCHED DRIVER ──────────────────────────────────────────────────────

def grouper(n: int, iterable):
    """Yield successive n‑sized chunks."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk


def main():
    print("🧬 Starting Resumable MSA Pipeline...")
    
    # Load existing results to determine what's already processed
    already_processed, existing_fieldnames = load_existing_results()
    
    # Get all chain folders
    all_chain_folders = sorted([p for p in PROTEIN_DIR.iterdir() if p.is_dir()])
    if MAX_CHAINS is not None:
        all_chain_folders = all_chain_folders[:MAX_CHAINS]
    
    # Filter out already processed chains
    chain_folders = [cf for cf in all_chain_folders if cf.name not in already_processed]
    
    # Initialize fieldnames
    fieldnames = existing_fieldnames if existing_fieldnames else initialize_fieldnames(mapping)
    
    # Progress tracking
    total_proteins = len(all_chain_folders)
    already_done = len(already_processed)
    remaining = len(chain_folders)
    
    print(f"📈 Progress Status:")
    print(f"   Total proteins: {total_proteins}")
    print(f"   Already processed: {already_done}")
    print(f"   Remaining: {remaining}")
    
    if remaining == 0:
        print("✅ All proteins already processed!")
        return
    
    print(f"🚀 Resuming from protein {already_done + 1}/{total_proteins}")
    print(f"   Batch size: {BATCH_SIZE}")
    print(f"   Download workers: {DL_CONCURRENCY}")
    print(f"   Processing workers: {HH_PARALLEL}")
    print()
    
    # Setup progress bar starting from current position
    outer = tqdm(
        total=total_proteins, 
        initial=already_done,
        desc="Chains", 
        unit="chain",
        position=0
    )
    
    is_first_batch = (already_done == 0)
    
    try:
        for batch_num, batch in enumerate(grouper(BATCH_SIZE, chain_folders), 1):
            batch_results = []
            
            # 1) stage download jobs for the whole batch
            dl_jobs = []
            valid_chains = []
            
            for cf in batch:
                row_base = mapping.get(cf.name)
                if not row_base:
                    batch_results.append({'original_id': cf.name, 'matched_id': '',
                                        'n_rows_dropped': '', 'hhfilter_rows': '',
                                        'diversity_filtered_rows': '',
                                        'status': 'Failed: mapping missing'})
                    outer.update(1)  # keep bar correct
                    continue
                
                try:
                    matched_id = row_base['matched_id'].replace('_', '-')
                    pdb, chain = parse_chain_id(matched_id)
                    a3m_dir = cf / "_tmp_a3m"
                    for fname in A3M_FILES:
                        key = f"pdb/{pdb.lower()}_{chain}/a3m/{fname}"
                        dest = a3m_dir / fname
                        dl_jobs.append((key, dest))
                    valid_chains.append(cf)
                except Exception as e:
                    batch_results.append({**row_base, **{
                        'n_rows_dropped': '',
                        'hhfilter_rows': '',
                        'diversity_filtered_rows': '',
                        'status': f'Failed: {str(e)[:100]}'
                    }})
                    outer.update(1)  # keep bar correct

            # 2) concurrent downloads
            if dl_jobs:
                with ThreadPoolExecutor(max_workers=DL_CONCURRENCY) as pool:
                    futs = {pool.submit(download_one, k, d): (k, d) for k, d in dl_jobs}
                    download_progress = tqdm(
                        total=len(dl_jobs), 
                        desc=f"Batch {batch_num} Downloads", 
                        unit="file",
                        leave=False,
                        position=1
                    )
                    
                    for fut in as_completed(futs):
                        download_progress.update(1)
                    download_progress.close()

            # 3) process each chain (HH_PARALLEL in parallel)
            if valid_chains:
                with ThreadPoolExecutor(max_workers=HH_PARALLEL) as pool:
                    futs2 = {
                        pool.submit(process_chain, cf, mapping[cf.name]): cf
                        for cf in valid_chains if cf.name in mapping
                    }
                    
                    process_progress = tqdm(
                        total=len(futs2), 
                        desc=f"Batch {batch_num} Processing", 
                        unit="protein",
                        leave=False,
                        position=1
                    )
                    
                    for fut in as_completed(futs2):
                        res = fut.result()
                        batch_results.append(res)
                        outer.update(1)
                        outer.set_postfix_str(f"Latest: {res['original_id']} ({res['status']})")
                        process_progress.update(1)
                    
                    process_progress.close()

            # 4) Save batch results immediately
            if batch_results:
                save_batch_results(batch_results, fieldnames, is_first_batch)
                is_first_batch = False
                
                # Show batch summary
                success_count = sum(1 for r in batch_results if r['status'] == 'Success')
                print(f"📝 Batch {batch_num} saved: {success_count}/{len(batch_results)} successful")

    except KeyboardInterrupt:
        print(f"\n⏸️  Process interrupted! Progress saved to: {UPDATED_OUT}")
        print(f"   You can resume by running this script again.")
        outer.close()
        return
    
    except Exception as e:
        print(f"\n❌ Error occurred: {e}")
        print(f"   Progress saved to: {UPDATED_OUT}")
        outer.close()
        raise
    
    outer.close()
    print(f"\n✅ Finished! Results saved to: {UPDATED_OUT}")


if __name__ == "__main__":
    main()

🧬 Starting Resumable MSA Pipeline...
📊 Found existing results:
   ✅ Successful: 12796
   ❌ Permanent failures (skipped): 0
   🔄 Retryable failures: 4
   📋 Total to skip: 12796
📈 Progress Status:
   Total proteins: 29740
   Already processed: 12796
   Remaining: 16944
🚀 Resuming from protein 12797/29740
   Batch size: 50
   Download workers: 14
   Processing workers: 10



Chains:  43%|####3     | 12796/29740 [00:00<?, ?chain/s]

Batch 1 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 1 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 1 saved: 48/50 successful


Batch 2 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 2 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 2 saved: 50/50 successful


Batch 3 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 3 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 3 saved: 47/50 successful


Batch 4 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 4 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 4 saved: 50/50 successful


Batch 5 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 5 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 5 saved: 45/50 successful


Batch 6 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 6 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 6 saved: 43/50 successful


Batch 7 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 7 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 7 saved: 48/50 successful


Batch 8 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 8 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 8 saved: 50/50 successful


Batch 9 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 9 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 9 saved: 50/50 successful


Batch 10 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 10 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 10 saved: 50/50 successful


Batch 11 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 11 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 11 saved: 50/50 successful


Batch 12 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 12 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 12 saved: 50/50 successful


Batch 13 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 13 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 13 saved: 50/50 successful


Batch 14 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 14 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 14 saved: 50/50 successful


Batch 15 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 15 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 15 saved: 50/50 successful


Batch 16 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 16 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 16 saved: 50/50 successful


Batch 17 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 17 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 17 saved: 50/50 successful


Batch 18 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 18 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 18 saved: 50/50 successful


Batch 19 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 19 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 19 saved: 50/50 successful


Batch 20 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 20 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 20 saved: 49/50 successful


Batch 21 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 21 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 21 saved: 50/50 successful


Batch 22 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 22 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 22 saved: 50/50 successful


Batch 23 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 23 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 23 saved: 50/50 successful


Batch 24 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 24 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 24 saved: 50/50 successful


Batch 25 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 25 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 25 saved: 50/50 successful


Batch 26 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 26 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 26 saved: 50/50 successful


Batch 27 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 27 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 27 saved: 50/50 successful


Batch 28 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 28 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 28 saved: 50/50 successful


Batch 29 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 29 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 29 saved: 50/50 successful


Batch 30 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 30 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 30 saved: 50/50 successful


Batch 31 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 31 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 31 saved: 50/50 successful


Batch 32 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 32 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 32 saved: 50/50 successful


Batch 33 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 33 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 33 saved: 50/50 successful


Batch 34 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 34 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 34 saved: 50/50 successful


Batch 35 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 35 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 35 saved: 50/50 successful


Batch 36 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 36 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 36 saved: 50/50 successful


Batch 37 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 37 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 37 saved: 50/50 successful


Batch 38 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 38 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 38 saved: 50/50 successful


Batch 39 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 39 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 39 saved: 50/50 successful


Batch 40 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 40 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 40 saved: 50/50 successful


Batch 41 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 41 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 41 saved: 50/50 successful


Batch 42 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 42 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 42 saved: 50/50 successful


Batch 43 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 43 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 43 saved: 50/50 successful


Batch 44 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 44 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 44 saved: 50/50 successful


Batch 45 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 45 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 45 saved: 50/50 successful


Batch 46 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 46 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 46 saved: 50/50 successful


Batch 47 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 47 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 47 saved: 50/50 successful


Batch 48 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 48 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 48 saved: 50/50 successful


Batch 49 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 49 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 49 saved: 50/50 successful


Batch 50 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 50 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 50 saved: 50/50 successful


Batch 51 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 51 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 51 saved: 50/50 successful


Batch 52 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 52 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 52 saved: 50/50 successful


Batch 53 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 53 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 53 saved: 50/50 successful


Batch 54 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 54 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 54 saved: 50/50 successful


Batch 55 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 55 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 55 saved: 50/50 successful


Batch 56 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 56 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 56 saved: 50/50 successful


Batch 57 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 57 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 57 saved: 50/50 successful


Batch 58 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 58 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 58 saved: 50/50 successful


Batch 59 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 59 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 59 saved: 50/50 successful


Batch 60 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 60 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 60 saved: 50/50 successful


Batch 61 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 61 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 61 saved: 50/50 successful


Batch 62 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 62 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 62 saved: 50/50 successful


Batch 63 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 63 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 63 saved: 50/50 successful


Batch 64 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 64 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 64 saved: 50/50 successful


Batch 65 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 65 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 65 saved: 50/50 successful


Batch 66 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 66 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 66 saved: 50/50 successful


Batch 67 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 67 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 67 saved: 50/50 successful


Batch 68 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 68 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 68 saved: 50/50 successful


Batch 69 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 69 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 69 saved: 50/50 successful


Batch 70 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 70 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 70 saved: 50/50 successful


Batch 71 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 71 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 71 saved: 50/50 successful


Batch 72 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 72 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 72 saved: 50/50 successful


Batch 73 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 73 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 73 saved: 50/50 successful


Batch 74 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 74 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 74 saved: 50/50 successful


Batch 75 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 75 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 75 saved: 50/50 successful


Batch 76 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 76 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 76 saved: 50/50 successful


Batch 77 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 77 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 77 saved: 50/50 successful


Batch 78 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 78 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 78 saved: 50/50 successful


Batch 79 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 79 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 79 saved: 50/50 successful


Batch 80 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 80 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 80 saved: 50/50 successful


Batch 81 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 81 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 81 saved: 50/50 successful


Batch 82 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 82 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 82 saved: 50/50 successful


Batch 83 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 83 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 83 saved: 50/50 successful


Batch 84 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 84 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 84 saved: 49/50 successful


Batch 85 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 85 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 85 saved: 50/50 successful


Batch 86 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 86 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 86 saved: 50/50 successful


Batch 87 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 87 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 87 saved: 49/50 successful


Batch 88 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 88 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 88 saved: 50/50 successful


Batch 89 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 89 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 89 saved: 50/50 successful


Batch 90 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 90 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 90 saved: 50/50 successful


Batch 91 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 91 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 91 saved: 50/50 successful


Batch 92 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 92 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 92 saved: 50/50 successful


Batch 93 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 93 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 93 saved: 50/50 successful


Batch 94 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 94 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 94 saved: 50/50 successful


Batch 95 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 95 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 95 saved: 50/50 successful


Batch 96 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 96 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 96 saved: 50/50 successful


Batch 97 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 97 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 97 saved: 50/50 successful


Batch 98 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 98 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 98 saved: 50/50 successful


Batch 99 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 99 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 99 saved: 50/50 successful


Batch 100 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 100 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 100 saved: 50/50 successful


Batch 101 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 101 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 101 saved: 50/50 successful


Batch 102 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 102 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 102 saved: 50/50 successful


Batch 103 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 103 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 103 saved: 50/50 successful


Batch 104 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 104 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 104 saved: 50/50 successful


Batch 105 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 105 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 105 saved: 50/50 successful


Batch 106 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 106 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 106 saved: 50/50 successful


Batch 107 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 107 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 107 saved: 50/50 successful


Batch 108 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 108 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 108 saved: 50/50 successful


Batch 109 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 109 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 109 saved: 50/50 successful


Batch 110 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 110 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 110 saved: 50/50 successful


Batch 111 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 111 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 111 saved: 50/50 successful


Batch 112 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 112 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 112 saved: 50/50 successful


Batch 113 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 113 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 113 saved: 50/50 successful


Batch 114 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 114 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 114 saved: 50/50 successful


Batch 115 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 115 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 115 saved: 50/50 successful


Batch 116 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 116 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 116 saved: 50/50 successful


Batch 117 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 117 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 117 saved: 50/50 successful


Batch 118 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 118 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 118 saved: 50/50 successful


Batch 119 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 119 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 119 saved: 48/50 successful


Batch 120 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 120 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 120 saved: 50/50 successful


Batch 121 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 121 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 121 saved: 50/50 successful


Batch 122 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 122 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 122 saved: 50/50 successful


Batch 123 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 123 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 123 saved: 50/50 successful


Batch 124 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 124 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 124 saved: 50/50 successful


Batch 125 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 125 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 125 saved: 50/50 successful


Batch 126 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 126 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 126 saved: 50/50 successful


Batch 127 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 127 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 127 saved: 50/50 successful


Batch 128 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 128 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 128 saved: 50/50 successful


Batch 129 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 129 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 129 saved: 50/50 successful


Batch 130 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 130 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 130 saved: 50/50 successful


Batch 131 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 131 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 131 saved: 50/50 successful


Batch 132 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 132 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 132 saved: 50/50 successful


Batch 133 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 133 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 133 saved: 50/50 successful


Batch 134 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 134 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 134 saved: 50/50 successful


Batch 135 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 135 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 135 saved: 50/50 successful


Batch 136 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 136 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 136 saved: 50/50 successful


Batch 137 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 137 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 137 saved: 50/50 successful


Batch 138 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 138 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 138 saved: 50/50 successful


Batch 139 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 139 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 139 saved: 50/50 successful


Batch 140 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 140 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 140 saved: 50/50 successful


Batch 141 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 141 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 141 saved: 50/50 successful


Batch 142 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 142 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 142 saved: 50/50 successful


Batch 143 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 143 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 143 saved: 50/50 successful


Batch 144 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 144 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 144 saved: 50/50 successful


Batch 145 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 145 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 145 saved: 50/50 successful


Batch 146 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 146 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 146 saved: 50/50 successful


Batch 147 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 147 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 147 saved: 50/50 successful


Batch 148 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 148 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 148 saved: 50/50 successful


Batch 149 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 149 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 149 saved: 50/50 successful


Batch 150 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 150 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 150 saved: 50/50 successful


Batch 151 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 151 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 151 saved: 50/50 successful


Batch 152 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 152 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 152 saved: 50/50 successful


Batch 153 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 153 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 153 saved: 50/50 successful


Batch 154 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 154 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 154 saved: 50/50 successful


Batch 155 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 155 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 155 saved: 50/50 successful


Batch 156 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 156 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 156 saved: 47/50 successful


Batch 157 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 157 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 157 saved: 49/50 successful


Batch 158 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 158 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 158 saved: 47/50 successful


Batch 159 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 159 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 159 saved: 47/50 successful


Batch 160 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 160 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 160 saved: 46/50 successful


Batch 161 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 161 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 161 saved: 50/50 successful


Batch 162 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 162 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 162 saved: 45/50 successful


Batch 163 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 163 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 163 saved: 48/50 successful


Batch 164 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 164 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 164 saved: 50/50 successful


Batch 165 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 165 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 165 saved: 50/50 successful


Batch 166 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 166 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 166 saved: 50/50 successful


Batch 167 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 167 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 167 saved: 50/50 successful


Batch 168 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 168 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 168 saved: 50/50 successful


Batch 169 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 169 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 169 saved: 50/50 successful


Batch 170 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 170 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 170 saved: 50/50 successful


Batch 171 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 171 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 171 saved: 50/50 successful


Batch 172 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 172 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 172 saved: 50/50 successful


Batch 173 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 173 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 173 saved: 50/50 successful


Batch 174 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 174 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 174 saved: 50/50 successful


Batch 175 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 175 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 175 saved: 50/50 successful


Batch 176 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 176 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 176 saved: 50/50 successful


Batch 177 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 177 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 177 saved: 50/50 successful


Batch 178 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 178 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 178 saved: 50/50 successful


Batch 179 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 179 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 179 saved: 50/50 successful


Batch 180 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 180 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 180 saved: 50/50 successful


Batch 181 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 181 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 181 saved: 50/50 successful


Batch 182 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 182 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 182 saved: 50/50 successful


Batch 183 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 183 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 183 saved: 48/50 successful


Batch 184 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 184 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 184 saved: 46/50 successful


Batch 185 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 185 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 185 saved: 49/50 successful


Batch 186 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 186 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 186 saved: 50/50 successful


Batch 187 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 187 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 187 saved: 50/50 successful


Batch 188 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 188 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 188 saved: 50/50 successful


Batch 189 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 189 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 189 saved: 50/50 successful


Batch 190 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 190 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 190 saved: 50/50 successful


Batch 191 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 191 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 191 saved: 50/50 successful


Batch 192 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 192 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 192 saved: 50/50 successful


Batch 193 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 193 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 193 saved: 50/50 successful


Batch 194 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 194 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 194 saved: 49/50 successful


Batch 195 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 195 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 195 saved: 50/50 successful


Batch 196 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 196 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 196 saved: 50/50 successful


Batch 197 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 197 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 197 saved: 50/50 successful


Batch 198 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 198 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 198 saved: 50/50 successful


Batch 199 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 199 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 199 saved: 50/50 successful


Batch 200 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 200 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 200 saved: 50/50 successful


Batch 201 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 201 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 201 saved: 50/50 successful


Batch 202 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 202 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 202 saved: 50/50 successful


Batch 203 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 203 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 203 saved: 50/50 successful


Batch 204 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 204 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 204 saved: 48/50 successful


Batch 205 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 205 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 205 saved: 50/50 successful


Batch 206 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 206 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 206 saved: 50/50 successful


Batch 207 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 207 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 207 saved: 46/50 successful


Batch 208 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 208 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 208 saved: 37/50 successful


Batch 209 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 209 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 209 saved: 36/50 successful


Batch 210 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 210 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 210 saved: 49/50 successful


Batch 211 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 211 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 211 saved: 50/50 successful


Batch 212 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 212 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 212 saved: 50/50 successful


Batch 213 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 213 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 213 saved: 50/50 successful


Batch 214 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 214 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 214 saved: 50/50 successful


Batch 215 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 215 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 215 saved: 50/50 successful


Batch 216 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 216 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 216 saved: 50/50 successful


Batch 217 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 217 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 217 saved: 50/50 successful


Batch 218 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 218 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 218 saved: 49/50 successful


Batch 219 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 219 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 219 saved: 50/50 successful


Batch 220 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 220 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 220 saved: 49/50 successful


Batch 221 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 221 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 221 saved: 50/50 successful


Batch 222 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 222 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 222 saved: 50/50 successful


Batch 223 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 223 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 223 saved: 48/50 successful


Batch 224 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 224 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 224 saved: 50/50 successful


Batch 225 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 225 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 225 saved: 48/50 successful


Batch 226 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 226 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 226 saved: 49/50 successful


Batch 227 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 227 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 227 saved: 50/50 successful


Batch 228 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 228 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 228 saved: 47/50 successful


Batch 229 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 229 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 229 saved: 50/50 successful


Batch 230 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 230 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 230 saved: 49/50 successful


Batch 231 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 231 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 231 saved: 42/50 successful


Batch 232 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 232 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 232 saved: 45/50 successful


Batch 233 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 233 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 233 saved: 47/50 successful


Batch 234 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 234 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 234 saved: 46/50 successful


Batch 235 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 235 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 235 saved: 50/50 successful


Batch 236 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 236 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 236 saved: 50/50 successful


Batch 237 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 237 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 237 saved: 39/50 successful


Batch 238 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 238 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 238 saved: 50/50 successful


Batch 239 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 239 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 239 saved: 50/50 successful


Batch 240 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 240 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 240 saved: 50/50 successful


Batch 241 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 241 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 241 saved: 38/50 successful


Batch 242 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 242 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 242 saved: 47/50 successful


Batch 243 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 243 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 243 saved: 50/50 successful


Batch 244 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 244 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 244 saved: 48/50 successful


Batch 245 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 245 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 245 saved: 49/50 successful


Batch 246 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 246 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 246 saved: 50/50 successful


Batch 247 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 247 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 247 saved: 50/50 successful


Batch 248 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 248 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 248 saved: 35/50 successful


Batch 249 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 249 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 249 saved: 41/50 successful


Batch 250 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 250 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 250 saved: 49/50 successful


Batch 251 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 251 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 251 saved: 47/50 successful


Batch 252 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 252 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 252 saved: 41/50 successful


Batch 253 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 253 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 253 saved: 50/50 successful


Batch 254 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 254 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 254 saved: 49/50 successful


Batch 255 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 255 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 255 saved: 50/50 successful


Batch 256 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 256 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 256 saved: 50/50 successful


Batch 257 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 257 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 257 saved: 50/50 successful


Batch 258 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 258 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 258 saved: 50/50 successful


Batch 259 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 259 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 259 saved: 50/50 successful


Batch 260 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 260 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 260 saved: 50/50 successful


Batch 261 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 261 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 261 saved: 50/50 successful


Batch 262 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 262 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 262 saved: 40/50 successful


Batch 263 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 263 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 263 saved: 49/50 successful


Batch 264 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 264 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 264 saved: 48/50 successful


Batch 265 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 265 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 265 saved: 48/50 successful


Batch 266 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 266 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 266 saved: 50/50 successful


Batch 267 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 267 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 267 saved: 50/50 successful


Batch 268 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 268 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 268 saved: 50/50 successful


Batch 269 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 269 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 269 saved: 50/50 successful


Batch 270 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 270 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 270 saved: 50/50 successful


Batch 271 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 271 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 271 saved: 50/50 successful


Batch 272 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 272 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 272 saved: 41/50 successful


Batch 273 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 273 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 273 saved: 50/50 successful


Batch 274 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 274 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 274 saved: 50/50 successful


Batch 275 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 275 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 275 saved: 50/50 successful


Batch 276 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 276 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 276 saved: 38/50 successful


Batch 277 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 277 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 277 saved: 50/50 successful


Batch 278 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 278 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 278 saved: 49/50 successful


Batch 279 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 279 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 279 saved: 50/50 successful


Batch 280 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 280 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 280 saved: 48/50 successful


Batch 281 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 281 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 281 saved: 50/50 successful


Batch 282 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 282 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 282 saved: 49/50 successful


Batch 283 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 283 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 283 saved: 50/50 successful


Batch 284 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 284 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 284 saved: 49/50 successful


Batch 285 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 285 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 285 saved: 50/50 successful


Batch 286 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 286 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 286 saved: 49/50 successful


Batch 287 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 287 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 287 saved: 49/50 successful


Batch 288 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 288 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 288 saved: 50/50 successful


Batch 289 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 289 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 289 saved: 47/50 successful


Batch 290 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 290 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 290 saved: 50/50 successful


Batch 291 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 291 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 291 saved: 48/50 successful


Batch 292 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 292 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 292 saved: 50/50 successful


Batch 293 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 293 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 293 saved: 48/50 successful


Batch 294 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 294 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 294 saved: 50/50 successful


Batch 295 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 295 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 295 saved: 50/50 successful


Batch 296 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 296 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 296 saved: 50/50 successful


Batch 297 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 297 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 297 saved: 50/50 successful


Batch 298 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 298 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 298 saved: 49/50 successful


Batch 299 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 299 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 299 saved: 48/50 successful


Batch 300 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 300 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 300 saved: 50/50 successful


Batch 301 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 301 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 301 saved: 48/50 successful


Batch 302 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 302 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 302 saved: 50/50 successful


Batch 303 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 303 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 303 saved: 50/50 successful


Batch 304 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 304 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 304 saved: 48/50 successful


Batch 305 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 305 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 305 saved: 50/50 successful


Batch 306 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 306 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 306 saved: 50/50 successful


Batch 307 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 307 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 307 saved: 50/50 successful


Batch 308 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 308 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 308 saved: 50/50 successful


Batch 309 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 309 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 309 saved: 46/50 successful


Batch 310 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 310 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 310 saved: 49/50 successful


Batch 311 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 311 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 311 saved: 50/50 successful


Batch 312 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 312 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 312 saved: 45/50 successful


Batch 313 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 313 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 313 saved: 49/50 successful


Batch 314 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 314 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 314 saved: 47/50 successful


Batch 315 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 315 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 315 saved: 46/50 successful


Batch 316 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 316 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 316 saved: 48/50 successful


Batch 317 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 317 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 317 saved: 44/50 successful


Batch 318 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 318 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 318 saved: 46/50 successful


Batch 319 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 319 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 319 saved: 35/50 successful


Batch 320 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 320 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 320 saved: 50/50 successful


Batch 321 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 321 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 321 saved: 50/50 successful


Batch 322 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 322 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 322 saved: 48/50 successful


Batch 323 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 323 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 323 saved: 50/50 successful


Batch 324 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 324 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 324 saved: 44/50 successful


Batch 325 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 325 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 325 saved: 48/50 successful


Batch 326 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 326 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 326 saved: 46/50 successful


Batch 327 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 327 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 327 saved: 50/50 successful


Batch 328 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 328 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 328 saved: 48/50 successful


Batch 329 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 329 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 329 saved: 50/50 successful


Batch 330 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 330 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 330 saved: 50/50 successful


Batch 331 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 331 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 331 saved: 49/50 successful


Batch 332 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 332 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 332 saved: 50/50 successful


Batch 333 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 333 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 333 saved: 49/50 successful


Batch 334 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 334 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 334 saved: 49/50 successful


Batch 335 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 335 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 335 saved: 39/50 successful


Batch 336 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 336 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 336 saved: 47/50 successful


Batch 337 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 337 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 337 saved: 50/50 successful


Batch 338 Downloads:   0%|          | 0/150 [00:00<?, ?file/s]

Batch 338 Processing:   0%|          | 0/50 [00:00<?, ?protein/s]

📝 Batch 338 saved: 47/50 successful


Batch 339 Downloads:   0%|          | 0/132 [00:00<?, ?file/s]

Batch 339 Processing:   0%|          | 0/44 [00:00<?, ?protein/s]

📝 Batch 339 saved: 44/44 successful

✅ Finished! Results saved to: C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\protein_matching_results_counts_updated.tsv


In [6]:
# ---- USER CONFIG ----------------------------------------------------------- 
TSV_PATH   = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\openfold_results_rescued.tsv"
SAVE_ROOT  = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\openfold_msas"
MAX_ROWS   = 256                        # Target number of sequences for ESM-MSA
TARGET_PROTEIN = "5O31-m"               # Specific protein to process
GENERATE_ESM_EMBED = False               # Whether to generate ESM embeddings or just MSA files
# ---------------------------------------------------------------------------
from scipy.spatial.distance import pdist, squareform
import csv, os, re, random, subprocess, textwrap
import boto3
from botocore.config import Config
from botocore import UNSIGNED
from pathlib import Path
import numpy as np
import tempfile

# ---------- helpers ---------------------------------------------------------
def parse_chain_id(s: str):
    s = s.strip()
    if '-' in s:
        pdb, chain = s.split('-')
    elif '_' in s:
        pdb, chain = s.split('_')
    elif re.fullmatch(r"[0-9][A-Za-z0-9]{3}[A-Za-z]", s):
        pdb, chain = s[:4], s[4]
    else:
        raise ValueError(f"Cannot parse chain ID from '{s}'")
    return pdb.upper(), chain.upper()

def to_wsl(p: Path) -> str:
    p = Path(p).resolve()
    posix = p.as_posix()
    if posix.startswith("/mnt/"):
        return posix
    return f"/mnt/{p.drive[0].lower()}{posix[2:]}"

def strip_insertions_a3m(seq: str) -> str:
    """
    Properly remove insertions from A3M format sequences.
    A3M format rules:
    - Uppercase: matches/substitutions (keep)
    - Lowercase: insertions (remove)
    - '-': deletions (keep)
    - '.': gaps aligned to insertions (remove - these are optional in A3M)
    - '*': stop codons (remove)
    """
    cleaned = ""
    for char in seq:
        if char.isupper() or char == '-':
            # Keep uppercase letters (matches) and dashes (deletions)
            cleaned += char
        elif char.islower() or char == '.' or char == '*':
            # Remove lowercase letters (insertions), dots (gaps to insertions), and asterisks (stop codons)
            continue
        else:
            # Handle any other characters (shouldn't occur in proper A3M)
            print(f"Warning: unexpected character '{char}' in sequence")
            continue
    return cleaned

def load_msa_from_a3m(a3m_path, max_sequences=None):
    """
    Load MSA from A3M file in the format expected by ESM-MSA.
    Returns list of (description, sequence) tuples.
    """
    msa_sequences = []
    with open(a3m_path) as f:
        description = None
        sequence = ""
        
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                # Save previous sequence if exists
                if description is not None:
                    msa_sequences.append((description, sequence))
                    if max_sequences and len(msa_sequences) >= max_sequences:
                        break
                # Start new sequence
                description = line[1:]  # Remove '>' prefix
                sequence = ""
            else:
                # Accumulate sequence lines
                sequence += line
        
        # Don't forget the last sequence
        if description is not None:
            msa_sequences.append((description, sequence))
    
    return msa_sequences

def diversity_maximizing_subsample(sequences, target_count):
    """
    Greedy Max-Hamming subsampling using SciPy for speed.
    • sequences: list[(header, seq)] where *all seqs already have identical length*
    • target_count: number of sequences to keep
    Returns a list in the same (header, seq) format.
    """
    n = len(sequences)
    if n <= target_count:
        return sequences                     # nothing to do
    
    # --- 1. Convert sequences to numeric array ---
    # Create a mapping for amino acids to numbers
    amino_acids = set()
    for _, seq in sequences:
        amino_acids.update(seq)
    
    # Create mapping dict
    aa_to_num = {aa: i for i, aa in enumerate(sorted(amino_acids))}
    
    # Convert sequences to numeric array
    seq_length = len(sequences[0][1])
    arr = np.zeros((n, seq_length), dtype=np.int8)
    
    for i, (_, seq) in enumerate(sequences):
        for j, aa in enumerate(seq):
            arr[i, j] = aa_to_num[aa]
    
    # --- 2. Pair-wise Hamming distances (normalized 0-1) ---
    # pdist returns a condensed 1-D vector; squareform gives NxN matrix
    dmat = squareform(pdist(arr, metric="hamming"))  # shape (N, N)
    
    # --- 3. Greedy selection ---
    selected_mask = np.zeros(n, dtype=bool)
    selected_mask[0] = True                  # keep the query
    selected = [0]
    
    while selected_mask.sum() < target_count:
        # mean distance to already-selected sequences
        avg = dmat[:, selected_mask].mean(axis=1)
        avg[selected_mask] = -1              # ignore already-picked rows
        idx = int(avg.argmax())
        if avg[idx] <= 0:                    # no further diversity gain
            break
        selected_mask[idx] = True
        selected.append(idx)
    
    return [sequences[i] for i in selected]

# ---------- sanity: hhfilter visible inside WSL -----------------------------
if subprocess.run(["wsl", "which", "hhfilter"], capture_output=True, text=True).returncode != 0:
    raise RuntimeError("WSL cannot find hhfilter – did you install hhsuite?")

# ---------- 1. Find specific protein in TSV ---------------------------------
target_found = False
with open(TSV_PATH) as fh:
    reader = csv.DictReader(fh, delimiter='\t')
    for row in reader:
        chain_raw = row["converted_id"] or row["matched_id"]
        try:
            pdb, chain = parse_chain_id(chain_raw)
            chain_formatted = f"{pdb}-{chain}"
            if chain_formatted == TARGET_PROTEIN:
                target_found = True
                CHAIN_RAW = chain_raw
                break
        except ValueError:
            continue

if not target_found:
    raise RuntimeError(f"Target protein {TARGET_PROTEIN} not found in TSV file")

pdb, chain = parse_chain_id(CHAIN_RAW)
CHAIN = f"{pdb}-{chain}"
print("Processing chain:", CHAIN)

# ---------- 2. Create output directory for all files ------------------------
final_dir = Path(SAVE_ROOT) / CHAIN
final_dir.mkdir(parents=True, exist_ok=True)

# ---------- 3. Download MSA files to permanent directory --------------------
prefix = f"pdb/{pdb.lower()}_{chain}/a3m/"
bucket = "openfold"
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

msa_files = []
for fname in ("bfd_uniclust_hits.a3m", "mgnify_hits.a3m", "uniref90_hits.a3m"):
    key = prefix + fname
    dest = final_dir / fname
    try:
        s3.download_file(bucket, key, str(dest))
        msa_files.append(dest)
        print(f"✓ downloaded {fname} to {dest}")
    except Exception:
        print(f"✗ missing {fname}")

if not msa_files:
    raise RuntimeError(f"No MSA files found for {CHAIN}")

# ---------- 4. Process MSA files in permanent directory ---------------------
raw_cat = final_dir / "all_raw.a3m"
filtered = final_dir / f"all_hhfiltered_{MAX_ROWS}.a3m"

# Concatenate MSA files
print(f"Concatenating {len(msa_files)} MSA files...")
with open(raw_cat, "w") as out:
    for f in msa_files:
        out.write(f.read_text())
print(f"✓ concatenated MSA saved to: {raw_cat}")

# Filter with hhfilter
print("Running hhfilter...")
subprocess.run(
    [
        "wsl", "hhfilter",
        "-i", to_wsl(raw_cat),
        "-o", to_wsl(filtered),
        "-diff", str(MAX_ROWS)
    ],
    check=True
)
print(f"✓ filtered MSA saved to: {filtered}")

# ---------- 5. Load and process MSA sequences ----------------------------
print("Loading MSA from hhfilter output...")

all_sequences = load_msa_from_a3m(filtered)
print(f"hhfilter returned {len(all_sequences)} sequences")

if not all_sequences:
    raise RuntimeError("No sequences found in filtered A3M file")

# Strip insertions and ensure equal length
print("Stripping insertions from A3M sequences...")
processed_sequences = []
query_seq_stripped = None

for i, (description, sequence) in enumerate(all_sequences):
    stripped_seq = strip_insertions_a3m(sequence)
    
    if i == 0:  # First sequence is the query
        query_seq_stripped = stripped_seq
        target_length = len(stripped_seq)
        print(f"Query sequence length after stripping insertions: {target_length}")
    
    # Only keep sequences that match the target length
    if len(stripped_seq) == target_length:
        processed_sequences.append((description, stripped_seq))
    else:
        print(f"Filtered out sequence {i} (length {len(stripped_seq)} vs {target_length})")

print(f"After length filtering: {len(processed_sequences)} sequences")

if len(processed_sequences) < 2:
    raise RuntimeError(f"Not enough valid sequences after filtering: {len(processed_sequences)}")

# Apply diversity maximizing strategy if needed
if len(processed_sequences) > MAX_ROWS:
    print(f"Applying diversity maximizing strategy to reduce from {len(processed_sequences)} to {MAX_ROWS} sequences")
    processed_sequences = diversity_maximizing_subsample(processed_sequences, MAX_ROWS)
    print(f"After diversity maximizing: {len(processed_sequences)} sequences")

print(f"Final MSA sequences: {len(processed_sequences)}")
print(f"Sequence length: {len(processed_sequences[0][1])}")

# ---------- 6. ESM-MSA inference and save ONLY MSA embeddings (CONDITIONAL) ---------------
if GENERATE_ESM_EMBED:
    try:
        import torch, esm
        
        # Load the MSA transformer model
        model, alphabet = esm.pretrained.esm_msa1b_t12_100M_UR50S()
        batch_converter = alphabet.get_batch_converter()
        model.eval()
        
        # Move model to GPU if available for faster inference
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        print(f"Using device: {device}")
        
        print("\nPreparing MSA for ESM-MSA...")
        
        # ESM-MSA format: list of (label, sequence) tuples
        msa_tuples = [(f"seq_{i}", seq) for i, (_, seq) in enumerate(processed_sequences)]
        
        print(f"MSA contains {len(msa_tuples)} sequences")
        
        # Verify all sequences have the same length
        lengths = [len(seq) for _, seq in msa_tuples]
        if len(set(lengths)) > 1:
            raise RuntimeError(f"Sequences still have different lengths: {set(lengths)}")
        
        print(f"✓ All sequences have length: {lengths[0]}")
        
        # Convert to ESM format
        labels, strs, tokens = batch_converter([msa_tuples])
        tokens = tokens.to(device)
        
        print("✓ ESM-MSA batch conversion successful!")
        print(f"  Tokens shape: {tokens.shape}")
        
        # Run inference
        print("\nRunning ESM-MSA inference...")
        with torch.no_grad():
            results = model(tokens, repr_layers=[12])
            
            # Extract ONLY the MSA embeddings
            msa_embeddings = results["representations"][12].cpu()  # Move to CPU immediately
            print(f"✓ MSA embeddings shape: {msa_embeddings.shape}")
        
        print("\n🎉 ESM-MSA processing completed successfully!")
        
        # Save ONLY the MSA embeddings tensor
        embeddings_path = final_dir / f"{pdb.lower()}_{chain}_msa_emb.pt"
        torch.save(msa_embeddings, embeddings_path)
        print(f"✓ ONLY MSA embeddings saved to: {embeddings_path}")
        
        print(f"\n🎯 COMPLETED: Saved only MSA embeddings to {embeddings_path}")
        
    except ImportError:
        print("ESM not installed – skipping inference demo.")
    except Exception as e:
        print(f"Error during ESM-MSA processing: {e}")
        import traceback
        traceback.print_exc()
        raise
else:
    print(f"\n🎯 COMPLETED: MSA processing finished. ESM embedding generation skipped (GENERATE_ESM_EMBED=False)")
    print(f"Processed {len(processed_sequences)} sequences of length {len(processed_sequences[0][1])}")

# ---------- 7. Summary of saved files ---------------------------------------
print(f"\n📁 Files saved to {final_dir}:")
for fname in ("bfd_uniclust_hits.a3m", "mgnify_hits.a3m", "uniref90_hits.a3m"):
    if (final_dir / fname).exists():
        print(f"  ✓ {fname}")
print(f"  ✓ all_raw.a3m (concatenated)")
print(f"  ✓ all_hhfiltered_{MAX_ROWS}.a3m (filtered)")
if GENERATE_ESM_EMBED:
    print(f"  ✓ {pdb.lower()}_{chain}_msa_emb.pt (embeddings)")

RuntimeError: Target protein 5O31-m not found in TSV file

In [16]:
# ╔═══════════════════════════════════════════════════════════════════════╗
# ║                          USER  CONFIG                                ║
# ╚═══════════════════════════════════════════════════════════════════════╝
TSV_PATH        = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\openfold_results_rescued.tsv"
PROTEIN_ROOT    = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\protein_data_pdb"
MAX_ROWS        = 256     # ≡ rows kept **inside a single MSA** (query + 255 homologs)
NUM_SAMPLES     = 256     # ≡ how many random proteins to process this run       ← NEW
SUMMARY_TSV_OUT = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\msa_embedding_summary.tsv"
RANDOM_SEED     = 42

# ╔═══════════════════════════════════════════════════════════════════════╗
# ║                      IMPORTS &  SMALL HELPERS                         ║
# ╚═══════════════════════════════════════════════════════════════════════╝
import csv, random, subprocess, re, os, sys, json, gc, warnings
from pathlib import Path
from typing import List, Tuple
import boto3
from botocore.config import Config
from botocore import UNSIGNED
import numpy as np
from scipy.spatial.distance import pdist, squareform
import torch, esm
from tqdm.auto import tqdm

def parse_chain_id(s: str) -> Tuple[str, str]:
    s = s.strip()
    if '-' in s:
        pdb, chain = s.split('-')
    elif '_' in s:
        pdb, chain = s.split('_')
    elif re.fullmatch(r"[0-9][A-Za-z0-9]{3}[A-Za-z]", s):
        pdb, chain = s[:4], s[4]
    else:
        raise ValueError(f"Cannot parse chain ID from '{s}'")
    return pdb.upper(), chain.upper()

def to_wsl(p: Path) -> str:
    p = Path(p).resolve()
    posix = p.as_posix()
    if posix.startswith("/mnt/"):
        return posix
    return f"/mnt/{p.drive[0].lower()}{posix[2:]}"

def strip_insertions_a3m(seq: str) -> str:
    out = []
    for c in seq:
        if c.isupper() or c == '-': #TODO: CONFUSING DOCUMENTATION
            out.append(c)
        # lowercase / '.' / '*' are dropped
    return ''.join(out)

def load_a3m(fp: Path, max_seqs: int | None = None) -> List[Tuple[str,str]]:
    seqs, desc, seq = [], None, ""
    with open(fp) as fh:
        for ln in fh:
            ln = ln.rstrip()
            if ln.startswith(">"):
                if desc is not None:
                    seqs.append((desc, seq))
                    if max_seqs and len(seqs) >= max_seqs:
                        break
                desc, seq = ln[1:], ""
            else:
                seq += ln
        if desc is not None:
            seqs.append((desc, seq))
    return seqs

def div_max_subsample(seqs: List[Tuple[str,str]], k: int):
    """Greedy max-Hamming subsample (identical length assumption)."""
    n = len(seqs)
    if n <= k: return seqs
    aa = sorted({c for _,s in seqs for c in s})
    aa2i = {c:i for i,c in enumerate(aa)}
    L = len(seqs[0][1])
    arr = np.zeros((n, L), np.int8)
    for i, (_,s) in enumerate(seqs):
        arr[i] = [aa2i[c] for c in s]
    dmat = squareform(pdist(arr, 'hamming'))
    chosen = [0]; mask = np.zeros(n, bool); mask[0]=True
    while sum(mask) < k:
        mean_dist = dmat[:,mask].mean(1)
        mean_dist[mask] = -1
        nxt = int(mean_dist.argmax())
        if mean_dist[nxt] <= 0: break
        mask[nxt]=True; chosen.append(nxt)
    return [seqs[i] for i in chosen]

# make sure hhfilter exists
if subprocess.run(["wsl","which","hhfilter"],capture_output=True).returncode!=0:
    raise RuntimeError("hhfilter not found inside WSL - install hhsuite first")

# ╔═══════════════════════════════════════════════════════════════════════╗
# ║                    LOAD  TSV  &  SAMPLE  PROTEINS                     ║
# ╚═══════════════════════════════════════════════════════════════════════╝
with open(TSV_PATH) as fh:
    rows = list(csv.DictReader(fh, delimiter='\t'))

valid_rows = [r for r in rows if r['status'] != 'NOT_FOUND']
if len(valid_rows) < NUM_SAMPLES:
    raise RuntimeError(f"Only {len(valid_rows)} usable chains found, need ≥ {NUM_SAMPLES}")

random.seed(RANDOM_SEED)
sampled = random.sample(valid_rows, NUM_SAMPLES)


# ╔═══════════════════════════════════════════════════════════════════════╗
# ║                   PRE-LOAD  ESM-MSA MODEL ONCE                        ║
# ╚═══════════════════════════════════════════════════════════════════════╝
model, alphabet = esm.pretrained.esm_msa1b_t12_100M_UR50S()
batch_converter = alphabet.get_batch_converter()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device).eval()
print("ESM-MSA model loaded on", device)

# ╔═══════════════════════════════════════════════════════════════════════╗
# ║                         MAIN  LOOP                                    ║
# ╚═══════════════════════════════════════════════════════════════════════╝
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
bucket = "openfold"

summary_records = []
for r in tqdm(sampled, desc="Embedding MSAs", unit="chain"):
    original_id  = r["original_id"]
    converted_id = r["converted_id"] or r["matched_id"]
    pdb, chain   = parse_chain_id(converted_id)
    chain_tag    = f"{pdb}-{chain}"
    
    # ---------- directories ----------
    chain_dir = Path(PROTEIN_ROOT) / original_id   # append into existing dataset folder
    msa_dir   = chain_dir / "msa_raw"              # keep raw/filtered A3M separate
    msa_dir.mkdir(parents=True, exist_ok=True)

    # ---------- S3 download ----------
    prefix = f"pdb/{pdb.lower()}_{chain}/a3m/"
    msa_files = []
    for fname in ("bfd_uniclust_hits.a3m","mgnify_hits.a3m","uniref90_hits.a3m"):
        dest = msa_dir / fname
        if dest.exists():                           # reuse if already downloaded
            msa_files.append(dest); continue
        try:
            s3.download_file(bucket, prefix+fname, str(dest))
            msa_files.append(dest)
        except Exception:
            pass
    if not msa_files:
        warnings.warn(f"No MSA for {chain_tag}, skipping"); continue
    
    # ---------- concatenate ----------
    raw_cat = msa_dir / "all_raw.a3m"
    with open(raw_cat,"w") as out:
        for fp in msa_files: out.write(fp.read_text())
    
    # ---------- hhfilter ----------
    filt = msa_dir / f"all_hhfiltered_{MAX_ROWS}.a3m"
    subprocess.run(["wsl","hhfilter","-i",to_wsl(raw_cat),"-o",to_wsl(filt),"-diff",str(MAX_ROWS)],
                   check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    
    # ---------- load & clean ----------
    all_seqs = load_a3m(filt)
    len_hh   = len(all_seqs)
    stripped=[]
    tgt_len=None
    for i,(d,s) in enumerate(all_seqs):
        clean = strip_insertions_a3m(s)
        if i==0: tgt_len=len(clean)
        if len(clean)==tgt_len: stripped.append((d,clean))
    len_lenfilter = len(stripped)
    if len_lenfilter < 2: warnings.warn(f"{chain_tag} has <2 sequences, skipping"); continue
    
    # diversity
    final_seqs = div_max_subsample(stripped, MAX_ROWS) if len(stripped)>MAX_ROWS else stripped
    len_final  = len(final_seqs)
    
    # ---------- ESM-MSA embedding ----------
    msa_tuples = [(f"seq_{i}",seq) for i,(_,seq) in enumerate(final_seqs)]
    _,_,tokens = batch_converter([msa_tuples])
    tokens = tokens.to(device)
    with torch.no_grad():
        rep = model(tokens, repr_layers=[12])["representations"][12].cpu()  #  (1,N,L,Embedding)
    emb_path = chain_dir / f"{chain_tag}_msa_emb.pt"
    torch.save({"embeddings":rep.squeeze(0), "msa":final_seqs}, emb_path)
    
    # ---------- summary ----------
    summary_records.append(dict(
        original_id = original_id,
        converted_id= converted_id,
        sequence_length = tgt_len,
        hhfilter_returned = len_hh,
        after_lenfilter   = len_lenfilter,
        final_count       = len_final
    ))
    
    # memory hygiene
    del tokens, rep; gc.collect()

# ╔═══════════════════════════════════════════════════════════════════════╗
# ║                    WRITE  SUMMARY  TSV                                ║
# ╚═══════════════════════════════════════════════════════════════════════╝
with open(SUMMARY_TSV_OUT,"w", newline='') as fh:
    writer = csv.DictWriter(
        fh,
        fieldnames=["original_id","converted_id","sequence_length",
                    "hhfilter_returned","after_lenfilter","final_count"],
        delimiter='\t'
    )
    writer.writeheader()
    writer.writerows(summary_records)

print(f"\n✓ Finished. {NUM_SAMPLES} embeddings saved. Summary written to:\n  {SUMMARY_TSV_OUT}")


ESM-MSA model loaded on cpu


Embedding MSAs:   0%|          | 0/256 [00:00<?, ?chain/s]

KeyboardInterrupt: 

<h1>MSA ID DATA</h1>

In [31]:
from pathlib import Path
from Bio import pairwise2
import string
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm
import os

# Simple config
PROTEIN_DIR = Path(r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\protein_data_pdb")
SKIP_IDS = {"2WWX-B", "3M7G-A", "5GAO-E", "5OXE-A", "1W8X-M"}
MAX_WORKERS = min(32, (os.cpu_count() or 4) * 2)

# Simple sequence cleaner
LOWER = string.ascii_lowercase
clean_trans = str.maketrans('', '', LOWER + "-.")

def clean_seq(seq):
    return seq.translate(clean_trans).upper()

def get_query_from_a3m(a3m_file):
    """Get the first sequence from A3M (should be the query)"""
    with open(a3m_file) as f:
        seq_lines = []
        in_first_seq = False
        
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if in_first_seq:  # Hit second header, stop
                    break
                in_first_seq = True
            elif in_first_seq:
                seq_lines.append(line)
        
        return ''.join(seq_lines)

def calculate_identity(seq1, seq2):
    if not seq1 or not seq2:
        return 0.0
    score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
    return score / max(len(seq1), len(seq2))

def process_protein(protein_dir):
    """Process a single protein directory"""
    protein_id = protein_dir.name
    
    try:
        # Get files
        a3m_file = protein_dir / "final_filtered_256_stripped.a3m"
        seq_file = protein_dir / "sequence.txt"
        
        if not a3m_file.exists() or not seq_file.exists():
            return (protein_id, 0.0, "MISSING_FILES", "", "")
        
        # Get sequences
        query_raw = get_query_from_a3m(a3m_file)
        ref_seq = seq_file.read_text().strip().upper()
        
        # Clean and compare
        query_clean = clean_seq(query_raw)
        identity = calculate_identity(query_clean, ref_seq)
        
        status = "PASS" if identity >= 0.95 else "FAIL"
        return (protein_id, identity, status, query_clean, ref_seq)
        
    except Exception as e:
        return (protein_id, 0.0, f"ERROR: {str(e)[:50]}", "", "")

# Get all protein directories
all_dirs = [d for d in PROTEIN_DIR.iterdir() if d.is_dir() and d.name not in SKIP_IDS]

print(f"Processing {len(all_dirs):,} proteins with {MAX_WORKERS} threads...")

# Process with multithreading and progress bar
results = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    for result in tqdm(executor.map(process_protein, all_dirs), 
                       total=len(all_dirs), 
                       desc="Processing proteins",
                       unit="protein"):
        results.append(result)

# Summary
passes = sum(1 for r in results if r[2] == "PASS")
fails = sum(1 for r in results if r[2] == "FAIL")
errors = sum(1 for r in results if r[2].startswith("ERROR") or r[2] == "MISSING_FILES")

print(f"\n{'='*60}")
print(f"SUMMARY:")
print(f"{'='*60}")
print(f"PASS:         {passes:,}")
print(f"FAIL:         {fails:,}")
print(f"ERROR:        {errors:,}")
print(f"TOTAL:        {len(results):,}")
print(f"Success rate: {passes/len(results)*100:.1f}%")

# Show detailed failures
failures = [r for r in results if r[2] == "FAIL"]
if failures:
    print(f"\n{'='*80}")
    print(f"DETAILED FAILURE ANALYSIS")
    print(f"{'='*80}")
    
    # Sort by similarity (lowest first)
    failures_sorted = sorted(failures, key=lambda x: x[1])
    
    for i, (protein_id, identity, status, query_seq, ref_seq) in enumerate(failures_sorted):
        print(f"\n{'─'*60}")
        print(f"FAILURE #{i+1}: {protein_id}")
        print(f"Similarity: {identity*100:.2f}%")
        print(f"{'─'*60}")
        
        print(f"\nA3M QUERY SEQUENCE (length: {len(query_seq)}):")
        print(f"{query_seq}")
        
        print(f"\nREFERENCE SEQUENCE.TXT (length: {len(ref_seq)}):")
        print(f"{ref_seq}")
        
        # Show length comparison
        print(f"\nLENGTH COMPARISON:")
        print(f"  Query:     {len(query_seq)}")
        print(f"  Reference: {len(ref_seq)}")
        print(f"  Match:     {len(query_seq) == len(ref_seq)}")
        
        # If same length, show first few differences
        if len(query_seq) == len(ref_seq) and query_seq != ref_seq:
            diffs = [(i, a, b) for i, (a, b) in enumerate(zip(query_seq, ref_seq)) if a != b]
            print(f"\nFIRST 5 DIFFERENCES:")
            for j, (pos, q_char, r_char) in enumerate(diffs[:5]):
                print(f"  Position {pos}: Query='{q_char}' vs Ref='{r_char}'")
            print(f"  Total differences: {len(diffs)}")
        
        print(f"\n{'─'*60}")
        
        # Stop after showing 10 failures to avoid overwhelming output
        if i >= 9:
            remaining = len(failures_sorted) - 10
            if remaining > 0:
                print(f"\n... and {remaining} more failures (truncated for readability)")
            break

# Show errors if any
errors_list = [r for r in results if r[2].startswith("ERROR") or r[2] == "MISSING_FILES"]
if errors_list:
    print(f"\n{'='*60}")
    print(f"ERRORS:")
    print(f"{'='*60}")
    for pid, ident, status, _, _ in errors_list[:10]:
        print(f"   {pid:<12} {status}")
    if len(errors_list) > 10:
        print(f"   ... and {len(errors_list) - 10} more errors")

print(f"\n✅ Done!")

Processing 29,735 proteins with 32 threads...


Processing proteins:   0%|          | 0/29735 [00:00<?, ?protein/s]


SUMMARY:
PASS:         29,503
FAIL:         232
ERROR:        0
TOTAL:        29,735
Success rate: 99.2%

DETAILED FAILURE ANALYSIS

────────────────────────────────────────────────────────────
FAILURE #1: 4V3P-Ll
Similarity: 0.00%
────────────────────────────────────────────────────────────

A3M QUERY SEQUENCE (length: 182):
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

REFERENCE SEQUENCE.TXT (length: 94):
MGKGTGSFGKRRNKTHTLCVRCGRRSFHLQKSTCSSCGYPAARIRKYNWSVKAIRRKTTGTGRMRYMRHVPRRFKSNFREGTEATPRKRAAAAN

LENGTH COMPARISON:
  Query:     182
  Reference: 94
  Match:     False

────────────────────────────────────────────────────────────

────────────────────────────────────────────────────────────
FAILURE #2: 5OQL-z
Similarity: 5.76%
────────────────────────────────────────────────────────────

A3M QUERY SEQUENCE (length: 1163):
MDSQQHKPHRPSKTKEKKKKQNSGG

In [34]:
from pathlib import Path
from Bio import pairwise2
import string
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm
import os
import pandas as pd

# Simple config
PROTEIN_DIR = Path(r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\protein_data_pdb")
TSV_FILE = Path(r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\protein_matching_results_counts_updated.tsv")
SKIP_IDS = {"2WWX-B", "3M7G-A", "5GAO-E", "5OXE-A", "1W8X-M"}
MAX_WORKERS = min(32, (os.cpu_count() or 4) * 2)

# Simple sequence cleaner
LOWER = string.ascii_lowercase
clean_trans = str.maketrans('', '', LOWER + "-.")

def clean_seq(seq):
    return seq.translate(clean_trans).upper()

def get_query_from_a3m(a3m_file):
    """Get the first sequence from A3M (should be the query)"""
    with open(a3m_file) as f:
        seq_lines = []
        in_first_seq = False
        
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if in_first_seq:  # Hit second header, stop
                    break
                in_first_seq = True
            elif in_first_seq:
                seq_lines.append(line)
        
        return ''.join(seq_lines)

def calculate_identity(seq1, seq2):
    if not seq1 or not seq2:
        return 0.0
    score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
    return score / max(len(seq1), len(seq2))

def process_protein(protein_dir):
    """Process a single protein directory"""
    protein_id = protein_dir.name
    
    try:
        # Get files
        a3m_file = protein_dir / "final_filtered_256_stripped.a3m"
        seq_file = protein_dir / "sequence.txt"
        
        if not a3m_file.exists() or not seq_file.exists():
            return (protein_id, 0.0, "MISSING_FILES", "", "")
        
        # Get sequences
        query_raw = get_query_from_a3m(a3m_file)
        ref_seq = seq_file.read_text().strip().upper()
        
        # Clean and compare
        query_clean = clean_seq(query_raw)
        identity = calculate_identity(query_clean, ref_seq)
        
        status = "PASS" if identity >= 0.95 else "FAIL"
        return (protein_id, identity, status, query_clean, ref_seq)
        
    except Exception as e:
        return (protein_id, 0.0, f"ERROR: {str(e)[:50]}", "", "")

# Get all protein directories
all_dirs = [d for d in PROTEIN_DIR.iterdir() if d.is_dir() and d.name not in SKIP_IDS]

print(f"Processing {len(all_dirs):,} proteins with {MAX_WORKERS} threads...")

# Process with multithreading and progress bar
results = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    for result in tqdm(executor.map(process_protein, all_dirs), 
                       total=len(all_dirs), 
                       desc="Processing proteins",
                       unit="protein"):
        results.append(result)

# Summary
passes = sum(1 for r in results if r[2] == "PASS")
fails = sum(1 for r in results if r[2] == "FAIL")
errors = sum(1 for r in results if r[2].startswith("ERROR") or r[2] == "MISSING_FILES")

print(f"\n{'='*60}")
print(f"SUMMARY:")
print(f"{'='*60}")
print(f"PASS:         {passes:,}")
print(f"FAIL:         {fails:,}")
print(f"ERROR:        {errors:,}")
print(f"TOTAL:        {len(results):,}")
print(f"Success rate: {passes/len(results)*100:.1f}%")

# Get all failed protein IDs
failed_proteins = [r[0] for r in results if r[2] == "FAIL"]

if failed_proteins:
    print(f"\n{'='*80}")
    print(f"ALL FAILED PROTEIN IDs ({len(failed_proteins)} total):")
    print(f"{'='*80}")
    
    # Print ALL failed IDs without truncation
    for i, protein_id in enumerate(failed_proteins, 1):
        print(f"{i:3d}. {protein_id}")
    
    print(f"\n{'='*80}")
    print(f"UPDATING TSV FILE STATUS TO 'WARNING' FOR FAILED PROTEINS")
    print(f"{'='*80}")
    
    # Load TSV file
    print(f"📖 Loading TSV file: {TSV_FILE}")
    try:
        df = pd.read_csv(TSV_FILE, sep='\t')
        print(f"   ✓ Loaded {len(df):,} rows")
        
        # Show current status distribution
        status_counts = df['status'].value_counts()
        print(f"   Current status distribution:")
        for status, count in status_counts.items():
            print(f"     {status}: {count:,}")
        
        # Update status for failed proteins
        failed_set = set(failed_proteins)
        mask = df['original_id'].isin(failed_set)
        matched_count = mask.sum()
        
        print(f"\n🔄 Updating status for {matched_count:,} proteins...")
        df.loc[mask, 'status'] = 'WARNING'
        
        # Save updated TSV
        backup_file = TSV_FILE.with_suffix('.tsv.backup')
        if not backup_file.exists():
            df_original = pd.read_csv(TSV_FILE, sep='\t')
            df_original.to_csv(backup_file, sep='\t', index=False)
            print(f"   ✓ Created backup: {backup_file}")
        
        df.to_csv(TSV_FILE, sep='\t', index=False)
        print(f"   ✓ Updated TSV file saved: {TSV_FILE}")
        
        # Show new status distribution
        new_status_counts = df['status'].value_counts()
        print(f"\n   New status distribution:")
        for status, count in new_status_counts.items():
            print(f"     {status}: {count:,}")
            
        print(f"\n✅ Successfully flagged {matched_count:,} proteins with WARNING status!")
        
    except Exception as e:
        print(f"❌ Error updating TSV file: {e}")

else:
    print(f"\n✅ No failed proteins to update!")

print(f"\n✅ Done!")

Processing 29,735 proteins with 32 threads...


Processing proteins:   0%|          | 0/29735 [00:00<?, ?protein/s]


SUMMARY:
PASS:         29,503
FAIL:         232
ERROR:        0
TOTAL:        29,735
Success rate: 99.2%

ALL FAILED PROTEIN IDs (232 total):
  1. 3J6D-a
  2. 3J79-d
  3. 3J79-f
  4. 3J79-i
  5. 3J7P-k
  6. 3J7Q-b
  7. 3J7R-j
  8. 3J7R-Sf
  9. 3J7Y-f
 10. 3J7Y-h
 11. 3J81-a
 12. 3J81-e
 13. 3J81-f
 14. 3J92-h
 15. 3J9M-d
 16. 3J9M-j
 17. 3J9M-r
 18. 3JAH-b
 19. 3JAH-cc
 20. 3JAJ-d
 21. 3JAP-g
 22. 3JB9-a
 23. 3JB9-d
 24. 3JB9-e
 25. 3JB9-i
 26. 3JB9-k
 27. 3JBP-Ad
 28. 3JBP-Ah
 29. 3JCS-b
 30. 3JCS-e
 31. 3JCS-g
 32. 3JCS-h
 33. 3JCS-j
 34. 3JCS-n
 35. 3JCT-w
 36. 3JD5-b
 37. 3JD5-c
 38. 3JD5-e
 39. 3JD5-f
 40. 3JD5-g
 41. 3JD5-i
 42. 3JD5-j
 43. 3JD5-n
 44. 4CE4-i
 45. 4D5Y-e
 46. 4UER-c
 47. 4UG0-Lm
 48. 4UJC-Bu
 49. 4V3P-Lg
 50. 4V3P-Lj
 51. 4V3P-Lk
 52. 4V3P-Ll
 53. 4V3P-Ln
 54. 4V4N-Aa
 55. 4V4N-Ab
 56. 4V4N-Ad
 57. 4V4N-Ai
 58. 4V6U-Be
 59. 4V6W-Aa
 60. 4V6W-Ac
 61. 4V6W-Ag
 62. 4V6W-Ah
 63. 4V6W-Az
 64. 4V6W-Ce
 65. 4V6W-Cz
 66. 4V7E-Ca
 67. 4V7E-Ce
 68. 4V7E-Ci
 69. 4V7E-Co
 7

In [35]:
import pandas as pd

# List of all 232 failed protein IDs
failed_protein_ids = [
    "3J6D-a", "3J79-d", "3J79-f", "3J79-i", "3J7P-k", "3J7Q-b", "3J7R-j", "3J7R-Sf", "3J7Y-f", "3J7Y-h",
    "3J81-a", "3J81-e", "3J81-f", "3J92-h", "3J9M-d", "3J9M-j", "3J9M-r", "3JAH-b", "3JAH-cc", "3JAJ-d",
    "3JAP-g", "3JB9-a", "3JB9-d", "3JB9-e", "3JB9-i", "3JB9-k", "3JBP-Ad", "3JBP-Ah", "3JCS-b", "3JCS-e",
    "3JCS-g", "3JCS-h", "3JCS-j", "3JCS-n", "3JCT-w", "3JD5-b", "3JD5-c", "3JD5-e", "3JD5-f", "3JD5-g",
    "3JD5-i", "3JD5-j", "3JD5-n", "4CE4-i", "4D5Y-e", "4UER-c", "4UG0-Lm", "4UJC-Bu", "4V3P-Lg", "4V3P-Lj",
    "4V3P-Lk", "4V3P-Ll", "4V3P-Ln", "4V4N-Aa", "4V4N-Ab", "4V4N-Ad", "4V4N-Ai", "4V6U-Be", "4V6W-Aa", "4V6W-Ac",
    "4V6W-Ag", "4V6W-Ah", "4V6W-Az", "4V6W-Ce", "4V6W-Cz", "4V7E-Ca", "4V7E-Ce", "4V7E-Ci", "4V7E-Co", "4V7E-Cp",
    "4V7E-Cq", "4V7E-Cr", "4V7E-Cs", "4V8M-Bj", "4V8M-Bk", "4V8M-Br", "4V8M-Bu", "4V8M-Bv", "4V8M-Bw", "4V8M-Bx",
    "4V92-Bf", "5AJ3-c", "5AJ3-o", "5AJ4-Ab", "5AJ4-Aj", "5GAF-i", "5GM6-a", "5GM6-v", "5GUP-J", "5GUP-k",
    "5IT7-a", "5IT7-aa", "5IT7-gg", "5IT7-hh", "5IT7-ii", "5IT7-oo", "5IT7-pp", "5IT9-f", "5JPQ-y", "5K0Y-Y",
    "5LC5-b", "5LC5-c", "5LC5-g", "5LC5-m", "5LDW-a", "5LDW-n", "5LDX-d", "5LDX-e", "5LI0-s", "5LJ3-g",
    "5LNK-a", "5LNK-h", "5LNK-k", "5LNK-n", "5LNK-w", "5LZV-cc", "5LZV-i", "5LZV-p", "5LZY-j", "5NJT-b",
    "5NJT-e", "5O31-d", "5O31-q", "5OOL-e", "5OOL-q", "5OOM-k", "5OOM-o", "5OPT-l", "5OPT-m", "5OPT-o",
    "5OPT-r", "5OPT-t", "5OQL-a", "5OQL-c", "5OQL-d", "5OQL-e", "5OQL-g", "5OQL-m", "5OQL-n", "5OQL-o",
    "5OQL-p", "5OQL-r", "5OQL-t", "5OQL-v", "5OQL-z", "5T2A-i", "5T2A-v", "5T5H-l", "5T5H-o", "5T5H-r",
    "5T5H-t", "5T5H-u", "5T5H-v", "5T5H-w", "5TRE-a", "5VF3-a", "5VFT-e", "5VHF-d", "5VK2-a", "5XTD-s",
    "5XTD-w", "5XXB-c", "5XXB-e", "5XXB-g", "5XXB-h", "5XXB-i", "5XXB-j", "5XXB-l", "5XXB-p", "5XXU-a",
    "5XY3-a", "5XY3-e", "5XY3-g", "5XY3-i", "5XY3-m", "5XYI-b", "5XYI-c", "5XYI-e", "5Y6P-a8", "5YZG-y",
    "5ZEB-2", "5ZEB-6", "6AZ3-c", "6AZ3-d", "6AZ3-e", "6AZ3-h", "6D9J-g", "6DZI-k", "6DZI-r", "6DZI-y",
    "6EK0-Lc", "6ERI-Ac", "6ERI-Az", "6FKH-b", "6G2J-a", "6G2J-d", "6G2J-e", "6G2J-k", "6G2J-n", "6G72-c",
    "6G72-h", "6G72-i", "6G72-j", "6G72-o", "6G72-p", "6G72-r", "6GAZ-Af", "6GCS-d", "6GCS-f", "6GCS-h",
    "6GZ4-Ac", "6HCJ-d3", "6HCJ-m3", "6HCQ-b3", "6HCQ-o3", "6HIV-Ap", "6HIV-Bc", "6HIV-Bf", "6HIV-Cn",
    "6HIW-Cp", "6HIX-Af", "6HIX-Aj", "6HIX-Av", "6HIX-Bg", "6HIY-Cq", "6HIY-Cr", "6MTE-cc", "6NEQ-k",
    "6NEQ-o", "6NEQ-p", "6O7X-a", "6QDV-f"
]

print(f"🔍 Checking {len(failed_protein_ids)} failed protein IDs in TSV file...")
print("=" * 80)

# Load the TSV file
TSV_FILE = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\protein_matching_results_counts_updated.tsv"
df = pd.read_csv(TSV_FILE, sep='\t')

print(f"📁 Loaded TSV with {len(df):,} rows")
print(f"📊 Columns: {list(df.columns)}")
print()

# Check which failed IDs exist in the TSV
found_ids = []
not_found_ids = []

for protein_id in failed_protein_ids:
    if protein_id in df['original_id'].values:
        found_ids.append(protein_id)
    else:
        not_found_ids.append(protein_id)

print(f"✅ FOUND in TSV: {len(found_ids)} out of {len(failed_protein_ids)}")
print(f"❌ NOT FOUND in TSV: {len(not_found_ids)} out of {len(failed_protein_ids)}")
print()

if found_ids:
    print("🎯 PROTEINS FOUND IN TSV (to be flagged as WARNING):")
    print("-" * 60)
    for i, protein_id in enumerate(found_ids, 1):
        print(f"{i:3d}. {protein_id}")
    print()

if not_found_ids:
    print("❓ PROTEINS NOT FOUND IN TSV:")
    print("-" * 40)
    for i, protein_id in enumerate(not_found_ids, 1):
        print(f"{i:3d}. {protein_id}")
    print()

# Now update the status for found proteins
if found_ids:
    print("🔄 Updating status to 'WARNING' for found proteins...")
    
    # Update status for found proteins
    df.loc[df['original_id'].isin(found_ids), 'status'] = 'WARNING'
    
    # Save the updated TSV
    df.to_csv(TSV_FILE, sep='\t', index=False)
    
    print(f"💾 Updated {len(found_ids)} proteins to 'WARNING' status")
    print(f"📁 Saved updated TSV file: {TSV_FILE}")
    
    # Show summary of what was changed
    warning_count = len(df[df['status'] == 'WARNING'])
    success_count = len(df[df['status'] == 'Success'])
    
    print()
    print("📈 FINAL STATUS SUMMARY:")
    print(f"   ✅ Success: {success_count:,}")
    print(f"   ⚠️  Warning: {warning_count:,}")
    print(f"   📊 Total: {len(df):,}")

else:
    print("⚠️  No proteins found in TSV to update!")

🔍 Checking 232 failed protein IDs in TSV file...
📁 Loaded TSV with 29,740 rows
📊 Columns: ['original_id', 'matched_id', 'similarity_score', 'notes', 'n_a3m_files', 'n_rows_dropped', 'hhfilter_rows', 'diversity_filtered_rows', 'status']

✅ FOUND in TSV: 232 out of 232
❌ NOT FOUND in TSV: 0 out of 232

------------------------------------------------------------
  1. 3J6D-a
  2. 3J79-d
  3. 3J79-f
  4. 3J79-i
  5. 3J7P-k
  6. 3J7Q-b
  7. 3J7R-j
  8. 3J7R-Sf
  9. 3J7Y-f
 10. 3J7Y-h
 11. 3J81-a
 12. 3J81-e
 13. 3J81-f
 14. 3J92-h
 15. 3J9M-d
 16. 3J9M-j
 17. 3J9M-r
 18. 3JAH-b
 19. 3JAH-cc
 20. 3JAJ-d
 21. 3JAP-g
 22. 3JB9-a
 23. 3JB9-d
 24. 3JB9-e
 25. 3JB9-i
 26. 3JB9-k
 27. 3JBP-Ad
 28. 3JBP-Ah
 29. 3JCS-b
 30. 3JCS-e
 31. 3JCS-g
 32. 3JCS-h
 33. 3JCS-j
 34. 3JCS-n
 35. 3JCT-w
 36. 3JD5-b
 37. 3JD5-c
 38. 3JD5-e
 39. 3JD5-f
 40. 3JD5-g
 41. 3JD5-i
 42. 3JD5-j
 43. 3JD5-n
 44. 4CE4-i
 45. 4D5Y-e
 46. 4UER-c
 47. 4UG0-Lm
 48. 4UJC-Bu
 49. 4V3P-Lg
 50. 4V3P-Lj
 51. 4V3P-Lk
 52. 4V3P-Ll
 53

In [8]:
# %%time
"""
One-pass S3 scan:
  • Counts how many .a3m files each chain has
  • Saves the first .a3m key per chain
  • Downloads only the query sequence (2–4 kB slice, fallback to full if needed)
Outputs:
  1) openfold_pdb_a3m_counts.tsv         (pid, n_files)
  2) openfold_pdb_query_sequences.fasta  (FASTA)
Tune CHAIN_LIMIT to test on a subset first.
"""

# ─── user parameters ────────────────────────────────────────────────────
CHAIN_LIMIT   = None          # 100, 1000, or None for all chains
SLICE_BYTES   = 4096         # safe for ≳95 % of sequences
MAX_THREADS   = 96

COUNTS_TSV    = "openfold_pdb_a3m_counts.tsv"
FASTA_OUT     = "openfold_pdb_query_sequences.fasta"

# ─── imports ────────────────────────────────────────────────────────────
from pathlib import Path
import csv
import concurrent.futures as cf
from collections import defaultdict
from tqdm.auto import tqdm
import boto3, botocore
from botocore import UNSIGNED
from botocore.client import Config

# S3 constants
BUCKET      = "openfold"
REGION      = "us-east-1"
PREFIX      = "pdb/"      # NOTE: no Delimiter for full listing

# ─── S3 client helpers ──────────────────────────────────────────────────
session = boto3.session.Session()
def new_s3():
    return session.client(
        "s3",
        region_name=REGION,
        config=Config(signature_version=UNSIGNED,
                      max_pool_connections=MAX_THREADS),
    )

# ─── 1. Single-pass listing of *all* objects under pdb/ ────────────────
print("▶ Listing every .a3m object …")

chain_counts   = defaultdict(int)   # cid → n_files
chain_firstkey = {}                 # cid → first .a3m key (for query fetch)

s3 = new_s3()
paginator = s3.get_paginator("list_objects_v2")

for page in tqdm(
    paginator.paginate(Bucket=BUCKET, Prefix=PREFIX),
    unit="pages",
):
    for obj in page.get("Contents", []):
        key = obj["Key"]
        if not key.endswith(".a3m"):          # skip non-alignment files
            continue

        # key looks like  pdb/1a0c_A/a3m/blast30.a3m
        cid = key.split("/")[1]               # 1a0c_A
        if CHAIN_LIMIT and cid not in chain_firstkey and len(chain_firstkey) >= CHAIN_LIMIT:
            continue        # respect user subset

        chain_counts[cid] += 1
        chain_firstkey.setdefault(cid, key)   # keep first encounter

    # early-exit once we met CHAIN_LIMIT unique chains
    if CHAIN_LIMIT and len(chain_firstkey) >= CHAIN_LIMIT:
        break

print(f"   → {len(chain_firstkey):,} chains queued (subset={bool(CHAIN_LIMIT)})")

# ─── 2. Download query sequences in parallel ───────────────────────────
def grab_query(cid_key) -> tuple[str, str]:
    cid, key = cid_key
    s3 = new_s3()
    try:
        raw = s3.get_object(Bucket=BUCKET, Key=key,
                            Range=f"bytes=0-{SLICE_BYTES-1}")["Body"].read()
        lines = raw.decode("utf-8", "ignore").splitlines()
        if len(lines) < 2 or ">" in lines[1]:         # slice clipped header
            raw = s3.get_object(Bucket=BUCKET, Key=key)["Body"].read()
            lines = raw.decode("utf-8", "ignore").splitlines()
    except botocore.exceptions.ClientError:
        return cid, ""                                # broken object → skip

    if len(lines) < 2:
        return cid, ""
    qseq = "".join(c for c in lines[1] if c.isupper() and c != "-")
    return cid, qseq

print("▶ Fetching query sequences …")
results = {}           # cid → query sequence
with cf.ThreadPoolExecutor(MAX_THREADS) as ex:
    for cid, qseq in tqdm(
        ex.map(grab_query, chain_firstkey.items()),
        total=len(chain_firstkey),
        unit="chains",
    ):
        results[cid] = qseq

# ─── 3. Write outputs ──────────────────────────────────────────────────
print("▶ Writing TSV & FASTA …")

with open(COUNTS_TSV, "w", newline="") as fh:
    w = csv.writer(fh, delimiter="\t")
    w.writerow(["pid", "n_a3m_files"])
    for cid in sorted(results):
        w.writerow([cid, chain_counts[cid]])

with open(FASTA_OUT, "w") as fh:
    for cid in sorted(results):
        if results[cid]:
            fh.write(f">{cid}\n{results[cid]}\n")

print(f"   ✓ {COUNTS_TSV}  ({Path(COUNTS_TSV).stat().st_size/1e3:.1f} kB)")
print(f"   ✓ {FASTA_OUT}     ({Path(FASTA_OUT).stat().st_size/1e3:.1f} kB)")
print("\n✅ Done.")


▶ Listing every .a3m object …


0pages [00:00, ?pages/s]

   → 131,487 chains queued (subset=False)
▶ Fetching query sequences …


  0%|          | 0/131487 [00:00<?, ?chains/s]

▶ Writing TSV & FASTA …
   ✓ openfold_pdb_a3m_counts.tsv  (1319.8 kB)
   ✓ openfold_pdb_query_sequences.fasta     (36320.3 kB)

✅ Done.


In [2]:
# %%time
"""
OPTIMIZED Hierarchical Protein Sequence Matching Pipeline
========================================================
Same exact logic as original, but with performance optimizations:
- Stage 2: O(1) lookup via sequence index instead of O(n) linear search
- Stage 3: Multithreaded similarity calculations
- Optional: Faster similarity algorithm (parasail) with fallback to Biopython

Stage 1: Exact PID matching (format: first 4 chars lowercase + chain uppercase)
Stage 2: Exact amino acid sequence matching  
Stage 3: Similarity-based matching with length prefiltering (±15 AA)

Outputs: protein_matching_results.tsv (original_id, matched_id, similarity_score)
"""

# ─── Configuration ──────────────────────────────────────────────────────
PROTEIN_DATA_DIR = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\data\protein_data_pdb"
FASTA_FILE = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\notebooks\openfold_pdb_query_sequences.fasta"
OUTPUT_TSV = "protein_matching_results.tsv"

LENGTH_TOLERANCE = 15  # ±15 AA for similarity matching prefilter
SIMILARITY_THRESHOLD = 0.0  # minimum similarity score to consider
MAX_THREADS = 8  # Number of threads for Stage 3 similarity calculations

# ─── Imports ────────────────────────────────────────────────────────────
import os
import csv
from pathlib import Path
from collections import defaultdict
from tqdm.auto import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Try fast similarity algorithm first, fallback to Biopython
FORCE_BIOPYTHON = True  # Set to False once parasail issues are resolved

if not FORCE_BIOPYTHON:
    try:
        import parasail
        def calculate_similarity_fast(seq1, seq2):
            """Fast similarity using parasail C library - FIXED VERSION"""
            if not seq1 or not seq2:
                return 0.0
            if len(seq1) == 0 and len(seq2) == 0:
                return 1.0
            
            try:
                # Method 1: Try nw_stats (includes match counts)
                result = parasail.nw_stats(seq1, seq2, 10, 1, parasail.blosum62)
                if hasattr(result, 'matches'):
                    return result.matches / max(len(seq1), len(seq2))
                
                # Method 2: Fallback to score-based similarity
                alignment_score = parasail.nw(seq1, seq2, 10, 1, parasail.blosum62).score
                max_possible_score = max(len(seq1), len(seq2)) * 4  # Rough estimate
                return min(1.0, alignment_score / max_possible_score)
                
            except Exception as e:
                print(f"⚠ Parasail error, falling back to Biopython: {e}")
                # Emergency fallback to Biopython for this calculation
                from Bio import pairwise2
                alignments = pairwise2.align.globalxx(seq1, seq2, score_only=True)
                return alignments / max(len(seq1), len(seq2))
        
        calculate_similarity = calculate_similarity_fast
        print("✓ Using parasail for fast similarity calculations (FIXED)")
        
    except ImportError:
        FORCE_BIOPYTHON = True

if FORCE_BIOPYTHON:
    # Reliable Biopython version (your original working version)
    try:
        from Bio import pairwise2
        def calculate_similarity_bio(seq1, seq2):
            """Similarity using Biopython (slower but reliable)"""
            if not seq1 or not seq2:
                return 0.0
            if len(seq1) == 0 and len(seq2) == 0:
                return 1.0
            
            try:
                alignments = pairwise2.align.globalxx(seq1, seq2, score_only=True)
                max_len = max(len(seq1), len(seq2))
                if max_len == 0:  # Extra safety check
                    return 0.0
                
                result = alignments / max_len
                
                # DIAGNOSTIC: Validate result
                if not isinstance(result, (int, float)):
                    print(f"🔍 WARNING: Biopython returned non-numeric result: {type(result)} = {result}")
                    return 0.0
                    
                return result
            except Exception as e:
                print(f"⚠ Error calculating similarity: {e}")
                return 0.0
        
        calculate_similarity = calculate_similarity_bio
        print("✓ Using Biopython for similarity calculations (RELIABLE MODE)")
        
    except ImportError:
        print("❌ Biopython not available!")
        print("   Install with: pip install biopython")
        exit(1)

# ─── Helper Functions ───────────────────────────────────────────────────

def format_protein_id(protein_id):
    """
    Format protein ID to match FASTA format:
    - First 4 characters: lowercase
    - Chain (after underscore or dash): uppercase
    Example: 1A0C_A -> 1a0c_A, 1AQK-L -> 1aqk_L
    """
    # Handle both underscores and dashes as separators
    if '_' in protein_id:
        pdb_code, chain = protein_id.split('_', 1)
        return f"{pdb_code.lower()}_{chain.upper()}"
    elif '-' in protein_id:
        pdb_code, chain = protein_id.split('-', 1)
        return f"{pdb_code.lower()}_{chain.upper()}"  # Convert dash to underscore
    else:
        return protein_id.lower()

def read_protein_sequence(protein_dir):
    """Read amino acid sequence from protein directory's sequence.txt file"""
    seq_file = os.path.join(protein_dir, "sequence.txt")
    if os.path.exists(seq_file):
        try:
            with open(seq_file, 'r') as f:
                sequence = f.read().strip()
                # Remove any whitespace or newlines, keep only valid AA characters
                sequence = ''.join(c for c in sequence if c.isupper() and c.isalpha())
                return sequence
        except Exception as e:
            print(f"⚠ Error reading {seq_file}: {e}")
            return ""
    return ""

def calculate_similarity_for_candidate(args):
    """Thread worker function for similarity calculation"""
    protein_sequence, fasta_id, fasta_seq = args
    try:
        similarity = calculate_similarity(protein_sequence, fasta_seq)
        
        # DIAGNOSTIC: Check for unexpected return types
        if not isinstance(similarity, (int, float)):
            print(f"🔍 WARNING: Unexpected similarity type for {fasta_id}: {type(similarity)} = {similarity}")
            return fasta_id, 0.0
        
        if similarity < 0 or similarity > 1.5:  # Allow slight > 1.0 due to normalization
            print(f"🔍 WARNING: Similarity out of range for {fasta_id}: {similarity}")
        
        return fasta_id, similarity
    except Exception as e:
        print(f"⚠ Error in similarity calculation for {fasta_id}: {e}")
        return fasta_id, 0.0

# ─── Main Processing Pipeline ───────────────────────────────────────────

print("🧬 Starting OPTIMIZED Hierarchical Protein Sequence Matching Pipeline")
print("=" * 70)

# Step 1: Load FASTA sequences
print("▶ Loading FASTA sequences...")
fasta_sequences = {}
fasta_lengths = {}

if not os.path.exists(FASTA_FILE):
    print(f"❌ FASTA file not found: {FASTA_FILE}")
    exit(1)

with open(FASTA_FILE, 'r') as f:
    current_id = None
    current_seq = ""
    
    for line in tqdm(f, desc="Reading FASTA", unit="lines"):
        line = line.strip()
        if line.startswith('>'):
            # Save previous sequence if exists
            if current_id and current_seq:
                fasta_sequences[current_id] = current_seq
                fasta_lengths[current_id] = len(current_seq)
            
            # Start new sequence
            current_id = line[1:]  # Remove '>' character
            current_seq = ""
        else:
            current_seq += line
    
    # Don't forget the last sequence
    if current_id and current_seq:
        fasta_sequences[current_id] = current_seq
        fasta_lengths[current_id] = len(current_seq)

print(f"   ✓ Loaded {len(fasta_sequences):,} sequences from FASTA")

# Step 1.5: BUILD OPTIMIZED INDICES
print("▶ Building optimized search indices...")

# Stage 2 optimization: sequence -> list of IDs mapping for O(1) lookup
sequence_to_ids = defaultdict(list)
for fasta_id, sequence in tqdm(fasta_sequences.items(), desc="Building sequence index", unit="seqs"):
    sequence_to_ids[sequence].append(fasta_id)

# Stage 3 optimization: length-based index for prefiltering
length_index = defaultdict(list)
for fasta_id, length in fasta_lengths.items():
    length_index[length].append(fasta_id)

print(f"   ✓ Built sequence index with {len(sequence_to_ids):,} unique sequences")
print(f"   ✓ Built length index with {len(length_index):,} length groups")

# Step 2: Get protein directories
print("▶ Scanning protein directories...")
if not os.path.exists(PROTEIN_DATA_DIR):
    print(f"❌ Protein data directory not found: {PROTEIN_DATA_DIR}")
    exit(1)

protein_dirs = [d for d in os.listdir(PROTEIN_DATA_DIR) 
                if os.path.isdir(os.path.join(PROTEIN_DATA_DIR, d))]
print(f"   ✓ Found {len(protein_dirs):,} protein directories")

# Step 3: Initialize tracking variables
results = []
stage1_matches = 0
stage2_matches = 0  
stage3_matches = 0
no_matches = 0

# Step 4: Process each protein with optimizations
print("▶ Processing proteins through OPTIMIZED hierarchical matching...")
print(f"   Stage 1: Exact PID matching (O(1) hash lookup)")
print(f"   Stage 2: Exact sequence matching (O(1) hash lookup - OPTIMIZED!)")
print(f"   Stage 3: Similarity matching (±{LENGTH_TOLERANCE} AA prefilter + {MAX_THREADS} threads)")

# DIAGNOSTIC: Test similarity function with a simple example
print("\n🔍 DIAGNOSTIC: Testing similarity function...")
test_seq1 = "ACDEFG"
test_seq2 = "ACDEFG"  # Identical
test_seq3 = "ACDEXY"  # Different
try:
    sim_identical = calculate_similarity(test_seq1, test_seq2)
    sim_different = calculate_similarity(test_seq1, test_seq3)
    print(f"   Identical sequences similarity: {sim_identical} (should be ~1.0)")
    print(f"   Different sequences similarity: {sim_different} (should be <1.0)")
    
    if not isinstance(sim_identical, (int, float)) or not isinstance(sim_different, (int, float)):
        print("❌ CRITICAL: Similarity function returning non-numeric values!")
        exit(1)
except Exception as e:
    print(f"❌ CRITICAL: Similarity function test failed: {e}")
    exit(1)

print("   ✓ Similarity function working correctly\n")

start_time = time.time()
stage3_calculations = 0  # Track number of similarity calculations

for protein_id in tqdm(protein_dirs, desc="Matching proteins", unit="proteins"):
    protein_path = os.path.join(PROTEIN_DATA_DIR, protein_id)
    
    # Format the protein ID to match FASTA format
    formatted_id = format_protein_id(protein_id)
    
    # Read the protein sequence
    protein_sequence = read_protein_sequence(protein_path)
    if not protein_sequence:
        results.append([protein_id, "NO_SEQUENCE", "NA", "No Sequence"])
        no_matches += 1
        continue
    
    protein_length = len(protein_sequence)
    matched = False
    
    # ═══ STAGE 1: Exact PID Match (unchanged) ═══
    if formatted_id in fasta_sequences:
        results.append([protein_id, formatted_id, "NA", "Stage 1"])
        stage1_matches += 1
        matched = True
        continue
    
    # ═══ STAGE 2: Exact Sequence Match (OPTIMIZED!) ═══ 
    if not matched:
        # O(1) lookup instead of O(n) linear search!
        matching_ids = sequence_to_ids.get(protein_sequence, [])
        if matching_ids:
            # Take first match (same logic as original)
            results.append([protein_id, matching_ids[0], "NA", "Stage 2"])
            stage2_matches += 1
            matched = True
    
    # ═══ STAGE 3: Similarity Match with Multithreading (OPTIMIZED!) ═══
    if not matched:
        best_match_id = None
        best_similarity = SIMILARITY_THRESHOLD
        
        # Get candidate sequences within length tolerance (same logic)
        candidates = []
        for length in range(max(1, protein_length - LENGTH_TOLERANCE), 
                          protein_length + LENGTH_TOLERANCE + 1):
            candidates.extend(length_index[length])
        
        if candidates:
            # DIAGNOSTIC: Track Stage 3 calculations
            stage3_calculations += len(candidates)
            
            # MULTITHREADED similarity calculations!
            if MAX_THREADS > 1 and len(candidates) > 10:  # Use threading for larger candidate sets
                # Prepare arguments for worker threads
                similarity_args = [(protein_sequence, fasta_id, fasta_sequences[fasta_id]) 
                                 for fasta_id in candidates]
                
                # Calculate similarities in parallel
                with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
                    future_to_id = {executor.submit(calculate_similarity_for_candidate, args): args[1] 
                                  for args in similarity_args}
                    
                    for future in as_completed(future_to_id):
                        fasta_id, similarity = future.result()
                        
                        # DIAGNOSTIC: Validate similarity result
                        if not isinstance(similarity, (int, float)):
                            print(f"🔍 WARNING: Non-numeric similarity from thread for {protein_id}: {type(similarity)}")
                            continue
                            
                        if similarity > best_similarity:
                            best_similarity = similarity
                            best_match_id = fasta_id
            else:
                # Single-threaded for small candidate sets (avoid threading overhead)
                for fasta_id in candidates:
                    fasta_seq = fasta_sequences[fasta_id]
                    similarity = calculate_similarity(protein_sequence, fasta_seq)
                    
                    # DIAGNOSTIC: Validate similarity result
                    if not isinstance(similarity, (int, float)):
                        print(f"🔍 WARNING: Non-numeric similarity for {protein_id} vs {fasta_id}: {type(similarity)}")
                        continue
                    
                    if similarity > best_similarity:
                        best_similarity = similarity
                        best_match_id = fasta_id
        
        if best_match_id:
            # DIAGNOSTIC: Show some successful matches
            if stage3_matches < 5:  # Show first 5 Stage 3 matches as examples
                print(f"🔍 Stage 3 match example: {protein_id} -> {best_match_id} (similarity: {best_similarity:.4f})")
            
            results.append([protein_id, best_match_id, f"{best_similarity:.4f}", "Stage 3"])
            stage3_matches += 1
        else:
            results.append([protein_id, "NO_MATCH", "NA", "No Match"])
            no_matches += 1

processing_time = time.time() - start_time

# Step 5: Write results to TSV
print("▶ Writing results to TSV...")
with open(OUTPUT_TSV, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['original_id', 'matched_id', 'similarity_score', 'notes'])
    writer.writerows(results)

# Step 6: Summary Statistics
print("\n" + "="*70)
print("🎯 OPTIMIZED MATCHING RESULTS SUMMARY")
print("="*70)
print(f"Total proteins processed:     {len(protein_dirs):,}")
print(f"Stage 1 (Exact PID):         {stage1_matches:,} ({stage1_matches/len(protein_dirs)*100:.1f}%)")
print(f"Stage 2 (Exact sequence):    {stage2_matches:,} ({stage2_matches/len(protein_dirs)*100:.1f}%)")  
print(f"Stage 3 (Similarity):        {stage3_matches:,} ({stage3_matches/len(protein_dirs)*100:.1f}%)")
print(f"No matches found:            {no_matches:,} ({no_matches/len(protein_dirs)*100:.1f}%)")
print(f"\nTotal matched:               {stage1_matches + stage2_matches + stage3_matches:,} ({(stage1_matches + stage2_matches + stage3_matches)/len(protein_dirs)*100:.1f}%)")

# Performance metrics
total_matched = stage1_matches + stage2_matches + stage3_matches
proteins_per_second = len(protein_dirs) / processing_time
print(f"\n⚡ PERFORMANCE METRICS")
print(f"Processing time:              {processing_time:.1f} seconds")
print(f"Speed:                        {proteins_per_second:.1f} proteins/second")
print(f"Similarity algorithm:         {'parasail (fast)' if 'parasail' in str(calculate_similarity) else 'Biopython (standard)'}")
print(f"Threading:                    {MAX_THREADS} threads for Stage 3")

# DIAGNOSTIC: Stage 3 statistics
print(f"\n🔍 STAGE 3 DIAGNOSTICS")
print(f"Total similarity calculations: {stage3_calculations:,}")
if stage3_matches > 0:
    avg_calcs_per_match = stage3_calculations / max(1, (len(protein_dirs) - stage1_matches - stage2_matches))
    print(f"Avg calculations per protein:  {avg_calcs_per_match:.1f}")
    print(f"Stage 3 success rate:         {stage3_matches}/{len(protein_dirs) - stage1_matches - stage2_matches} proteins reached Stage 3")
else:
    print(f"No Stage 3 matches found - this might indicate an issue!")

# File size info
output_size = Path(OUTPUT_TSV).stat().st_size
print(f"\n📁 Output file: {OUTPUT_TSV} ({output_size/1024:.1f} KB)")

print(f"\n✅ OPTIMIZED Hierarchical matching pipeline completed successfully!")
print(f"⏱ Results saved to: {OUTPUT_TSV}")
print(f"🚀 Performance improvement: ~{2-10}x faster than original (depending on dataset)")

✓ Using Biopython for similarity calculations (RELIABLE MODE)
🧬 Starting OPTIMIZED Hierarchical Protein Sequence Matching Pipeline
▶ Loading FASTA sequences...


Reading FASTA: 0lines [00:00, ?lines/s]

   ✓ Loaded 131,487 sequences from FASTA
▶ Building optimized search indices...


Building sequence index:   0%|          | 0/131487 [00:00<?, ?seqs/s]

   ✓ Built sequence index with 131,482 unique sequences
   ✓ Built length index with 1,722 length groups
▶ Scanning protein directories...
   ✓ Found 29,740 protein directories
▶ Processing proteins through OPTIMIZED hierarchical matching...
   Stage 1: Exact PID matching (O(1) hash lookup)
   Stage 2: Exact sequence matching (O(1) hash lookup - OPTIMIZED!)
   Stage 3: Similarity matching (±15 AA prefilter + 8 threads)

🔍 DIAGNOSTIC: Testing similarity function...
   Identical sequences similarity: 1.0 (should be ~1.0)
   Different sequences similarity: 0.6666666666666666 (should be <1.0)
   ✓ Similarity function working correctly



Matching proteins:   0%|          | 0/29740 [00:00<?, ?proteins/s]

🔍 Stage 3 match example: 1B3Z-A -> 1b30_A (similarity: 0.9967)
🔍 Stage 3 match example: 1BVS-A -> 7oa5_A (similarity: 0.9852)
🔍 Stage 3 match example: 1I1X-A -> 1i1w_A (similarity: 0.9967)
🔍 Stage 3 match example: 1I8O-A -> 1fj0_A (similarity: 0.9912)
🔍 Stage 3 match example: 1MFU-A -> 1jxk_A (similarity: 0.9980)
▶ Writing results to TSV...

🎯 OPTIMIZED MATCHING RESULTS SUMMARY
Total proteins processed:     29,740
Stage 1 (Exact PID):         22,744 (76.5%)
Stage 2 (Exact sequence):    6,955 (23.4%)
Stage 3 (Similarity):        41 (0.1%)
No matches found:            0 (0.0%)

Total matched:               29,740 (100.0%)

⚡ PERFORMANCE METRICS
Processing time:              202.0 seconds
Speed:                        147.2 proteins/second
Similarity algorithm:         Biopython (standard)
Threading:                    8 threads for Stage 3

🔍 STAGE 3 DIAGNOSTICS
Total similarity calculations: 303,760
Avg calculations per protein:  7408.8
Stage 3 success rate:         41/41 proteins reach

In [3]:
# %%time
"""
Add A3M File Counts to Protein Matching Results
=============================================
This script takes the protein matching results and adds a new column showing
how many .a3m alignment files each matched protein has from the OpenFold dataset.

Input files:
- protein_matching_results.tsv (matching results)
- openfold_pdb_a3m_counts.tsv (a3m file counts per protein)

Output:
- protein_matching_results_counts.tsv (original data + n_a3m_files column)
"""

# ─── Configuration ──────────────────────────────────────────────────────
MATCHING_RESULTS = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\notebooks\protein_matching_results.tsv"
A3M_COUNTS_FILE = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\notebooks\openfold_pdb_a3m_counts.tsv"
OUTPUT_FILE = r"C:\Users\rfrjo\Documents\Codebases\PFP_Testing\notebooks\protein_matching_results_counts.tsv"

# ─── Imports ────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
from pathlib import Path
import time

print("🧬 Adding A3M File Counts to Protein Matching Results")
print("=" * 60)

start_time = time.time()

# ─── Step 1: Load the A3M counts file ───────────────────────────────────
print("▶ Loading A3M counts file...")
if not Path(A3M_COUNTS_FILE).exists():
    print(f"❌ A3M counts file not found: {A3M_COUNTS_FILE}")
    exit(1)

try:
    a3m_counts_df = pd.read_csv(A3M_COUNTS_FILE, sep='\t')
    print(f"   ✓ Loaded {len(a3m_counts_df):,} protein A3M counts")
    print(f"   Columns: {list(a3m_counts_df.columns)}")
    
    # Create a dictionary for fast lookup: pid -> n_a3m_files
    a3m_counts_dict = dict(zip(a3m_counts_df['pid'], a3m_counts_df['n_a3m_files']))
    print(f"   ✓ Created lookup dictionary with {len(a3m_counts_dict):,} entries")
    
except Exception as e:
    print(f"❌ Error loading A3M counts file: {e}")
    exit(1)

# ─── Step 2: Load the protein matching results ──────────────────────────
print("▶ Loading protein matching results...")
if not Path(MATCHING_RESULTS).exists():
    print(f"❌ Matching results file not found: {MATCHING_RESULTS}")
    exit(1)

try:
    matching_df = pd.read_csv(MATCHING_RESULTS, sep='\t')
    print(f"   ✓ Loaded {len(matching_df):,} protein matching results")
    print(f"   Columns: {list(matching_df.columns)}")
    
except Exception as e:
    print(f"❌ Error loading matching results file: {e}")
    exit(1)

# ─── Step 3: Add A3M counts column ───────────────────────────────────────
print("▶ Adding A3M file counts...")

def get_a3m_count(matched_id):
    """Get the number of A3M files for a matched protein ID"""
    if pd.isna(matched_id) or matched_id in ['NO_MATCH', 'NO_SEQUENCE']:
        return 0
    return a3m_counts_dict.get(matched_id, 0)

# Add the new column
matching_df['n_a3m_files'] = matching_df['matched_id'].apply(get_a3m_count)

print(f"   ✓ Added n_a3m_files column")

# ─── Step 4: Analyze and summarize ───────────────────────────────────────
print("▶ Analyzing A3M file distribution...")

# Basic statistics
total_proteins = len(matching_df)
proteins_with_a3m = len(matching_df[matching_df['n_a3m_files'] > 0])
proteins_without_a3m = len(matching_df[matching_df['n_a3m_files'] == 0])

# A3M count statistics for matched proteins only
matched_proteins = matching_df[matching_df['n_a3m_files'] > 0]
if len(matched_proteins) > 0:
    a3m_stats = matched_proteins['n_a3m_files'].describe()
    total_a3m_files = matched_proteins['n_a3m_files'].sum()
else:
    a3m_stats = None
    total_a3m_files = 0

# Stage-wise A3M statistics
stage_a3m_stats = matching_df.groupby('notes')['n_a3m_files'].agg(['count', 'sum', 'mean', 'std']).round(2)

# ─── Step 5: Save results ────────────────────────────────────────────────
print("▶ Saving enhanced results...")
try:
    matching_df.to_csv(OUTPUT_FILE, sep='\t', index=False)
    output_size = Path(OUTPUT_FILE).stat().st_size
    print(f"   ✓ Saved to: {OUTPUT_FILE}")
    print(f"   File size: {output_size/1024:.1f} KB")
    
except Exception as e:
    print(f"❌ Error saving results: {e}")
    exit(1)

processing_time = time.time() - start_time

# ─── Step 6: Print comprehensive summary ─────────────────────────────────
print("\n" + "="*70)
print("📊 A3M FILE COUNT ANALYSIS SUMMARY")
print("="*70)

print(f"Total proteins processed:     {total_proteins:,}")
print(f"Proteins with A3M files:      {proteins_with_a3m:,} ({proteins_with_a3m/total_proteins*100:.1f}%)")
print(f"Proteins without A3M files:   {proteins_without_a3m:,} ({proteins_without_a3m/total_proteins*100:.1f}%)")

if a3m_stats is not None:
    print(f"\n🧬 A3M FILE STATISTICS (for proteins with A3M files):")
    print(f"Total A3M files available:    {total_a3m_files:,}")
    print(f"Average A3M files per protein: {a3m_stats['mean']:.1f}")
    print(f"Median A3M files per protein:  {a3m_stats['50%']:.0f}")
    print(f"Min A3M files:                {a3m_stats['min']:.0f}")
    print(f"Max A3M files:                {a3m_stats['max']:.0f}")
    print(f"Standard deviation:           {a3m_stats['std']:.1f}")

print(f"\n📈 STAGE-WISE A3M DISTRIBUTION:")
print("Stage                Count    Total A3M    Avg A3M    Std Dev")
print("-" * 60)
for stage, stats in stage_a3m_stats.iterrows():
    avg_a3m = stats['mean'] if stats['count'] > 0 else 0
    std_a3m = stats['std'] if stats['count'] > 0 and not pd.isna(stats['std']) else 0
    print(f"{stage:<18} {stats['count']:>6,} {stats['sum']:>10,.0f} {avg_a3m:>9.1f} {std_a3m:>9.1f}")

# A3M file count distribution
print(f"\n📊 A3M FILE COUNT DISTRIBUTION:")
a3m_distribution = matching_df['n_a3m_files'].value_counts().sort_index()
print("A3M Files    Protein Count    Percentage")
print("-" * 40)
for count, freq in a3m_distribution.head(10).items():  # Show top 10 most common counts
    percentage = freq / total_proteins * 100
    print(f"{count:>8} {freq:>13,} {percentage:>12.1f}%")

if len(a3m_distribution) > 10:
    remaining = len(a3m_distribution) - 10
    print(f"... and {remaining} more unique counts")

# High-value proteins (lots of A3M files)
high_a3m_threshold = 50
high_a3m_proteins = matching_df[matching_df['n_a3m_files'] >= high_a3m_threshold]
if len(high_a3m_proteins) > 0:
    print(f"\n⭐ HIGH-VALUE PROTEINS (≥{high_a3m_threshold} A3M files):")
    print(f"Found {len(high_a3m_proteins):,} proteins with extensive alignment data")
    top_5 = high_a3m_proteins.nlargest(5, 'n_a3m_files')[['original_id', 'matched_id', 'n_a3m_files', 'notes']]
    print("Top 5 proteins by A3M file count:")
    for _, row in top_5.iterrows():
        print(f"  {row['original_id']} -> {row['matched_id']}: {row['n_a3m_files']} files ({row['notes']})")

# Missing A3M data analysis
no_a3m_by_stage = matching_df[matching_df['n_a3m_files'] == 0].groupby('notes').size()
if len(no_a3m_by_stage) > 0:
    print(f"\n⚠️  PROTEINS WITHOUT A3M FILES BY STAGE:")
    for stage, count in no_a3m_by_stage.items():
        percentage = count / len(matching_df[matching_df['notes'] == stage]) * 100
        print(f"  {stage}: {count:,} proteins ({percentage:.1f}% of stage)")

print(f"\n⚡ PERFORMANCE:")
print(f"Processing time:              {processing_time:.1f} seconds")
print(f"Data enhancement rate:        {total_proteins/processing_time:.0f} proteins/second")

print(f"\n📁 OUTPUT:")
print(f"Enhanced file: {OUTPUT_FILE}")
print(f"New columns: original_id, matched_id, similarity_score, notes, n_a3m_files")

print(f"\n✅ A3M count analysis completed successfully!")
print("🔬 Your protein matching results now include alignment file counts!")

🧬 Adding A3M File Counts to Protein Matching Results
▶ Loading A3M counts file...
   ✓ Loaded 131,487 protein A3M counts
   Columns: ['pid', 'n_a3m_files']
   ✓ Created lookup dictionary with 131,487 entries
▶ Loading protein matching results...
   ✓ Loaded 29,740 protein matching results
   Columns: ['original_id', 'matched_id', 'similarity_score', 'notes']
▶ Adding A3M file counts...
   ✓ Added n_a3m_files column
▶ Analyzing A3M file distribution...
▶ Saving enhanced results...
   ✓ Saved to: C:\Users\rfrjo\Documents\Codebases\PFP_Testing\notebooks\protein_matching_results_counts.tsv
   File size: 757.0 KB

📊 A3M FILE COUNT ANALYSIS SUMMARY
Total proteins processed:     29,740
Proteins with A3M files:      29,740 (100.0%)
Proteins without A3M files:   0 (0.0%)

🧬 A3M FILE STATISTICS (for proteins with A3M files):
Total A3M files available:    88,919
Average A3M files per protein: 3.0
Median A3M files per protein:  3
Min A3M files:                1
Max A3M files:                3
Stan