In [None]:
import csv
import os
import requests

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------

# Input CSV; adjust this to wherever the file lives on your machine.
csv_path = "../data/ps4_data.csv"

# Destination directory for the downloaded files. It is created up front so
# later writes can assume it exists; an already-existing directory is fine.
output_dir = "../data/mmcif"
os.makedirs(output_dir, exist_ok=True)

# Maps an obsolete PDB ID to the ID that should be requested in its place.
obsolete_replacements = {
    "4fxg": "5jpm",
    "4bgo": "7q1g",
    "1h6w": "5lye",
    "5esy": "8f9y",
}

# Filled in later with the PDB IDs found in the CSV.
pdb_ids = []

# Read the CSV and collect one PDB ID per row: the first four characters of
# the "chain_id" column, lower-cased. Rows whose chain_id is missing or empty
# contribute nothing.
with open(csv_path, newline="", encoding="utf-8") as csv_file:
    chain_ids = (
        record.get("chain_id", "") for record in csv.DictReader(csv_file)
    )
    pdb_ids.extend(chain[:4].lower() for chain in chain_ids if chain)

# Drop repeated IDs; dict keys keep the order in which each ID first appeared.
pdb_ids = [*dict.fromkeys(pdb_ids)]

print(f"Found {len(pdb_ids)} unique PDB IDs to download")

# Pull data for each PDB ID from https://pdb-redo.eu/dssp/db/{pdb_id}/mmcif
# and store it as <output_dir>/<pdb_id>.cif. IDs whose file already exists are
# skipped, so the script can be re-run to pick up where it left off.
total = len(pdb_ids)

# One Session for the whole run reuses the underlying connection instead of
# opening a new one for every file.
with requests.Session() as session:
    for i, pdb_id in enumerate(pdb_ids, start=1):
        # The file is always named after the ID found in the CSV, even when
        # the data is fetched under a replacement ID (see below).
        output_path = os.path.join(output_dir, f"{pdb_id}.cif")

        # Skip if file already exists
        if os.path.exists(output_path):
            continue

        # Obsolete PDB IDs are requested under their replacement ID. The loop
        # variable is left untouched so messages can name both IDs correctly.
        source_id = obsolete_replacements.get(pdb_id, pdb_id)
        url = f"https://pdb-redo.eu/dssp/db/{source_id}/mmcif"

        # Write to a sibling ".part" file and rename it once complete. Writing
        # straight to output_path would let an interrupted or failed write
        # leave a truncated .cif behind, which the exists-check above would
        # then skip on every later run. A stale ".part" file is harmless: it
        # is overwritten by the next attempt.
        partial_path = f"{output_path}.part"

        try:
            response = session.get(url, timeout=30)
            response.raise_for_status()

            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, output_path)
        except requests.exceptions.RequestException as e:
            # Best effort: report the failure and move on to the next ID.
            print(f"[{i}/{total}] Failed to download {source_id}: {e}")
        else:
            if source_id == pdb_id:
                print(f"[{i}/{total}] Downloaded {pdb_id}.cif")
            else:
                # Name the file that was actually written, and where its
                # contents came from.
                print(
                    f"[{i}/{total}] Downloaded {pdb_id}.cif "
                    f"(from replacement entry {source_id})"
                )

print("Done!")