In [1]:
import os
import subprocess
import json

"""This is solublempnn to find the sequence for both capping, and the 14AA extended from the capping sequence
"""
# conda activate mlfold
# Input and output directories
folder_with_pdbs = "/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/RFdiffusion/1_cap_calculation/C_cap_add_3ultback"
output_dir = "/home/eva/0_bury_charged_pair/0_Biopython_playground/0_myown_tips_tricks/RFdiffusion/1_cap_calculation/C_cap_add_3ultback/mpnn"

# Ensure the output folder exists
os.makedirs(output_dir, exist_ok=True)

# Paths for intermediate files
path_for_parsed_chains = f"{output_dir}/parsed_pdbs.jsonl"
path_for_assigned_chains = f"{output_dir}/assigned_pdbs.jsonl"
path_for_fixed_positions = f"{output_dir}/fixed_positions_caps.jsonl"
chains_to_design = "A"

# Step 1: Parse PDB files to extract sequences and residue indices
# Command to call parse_multiple_chains.py script
command_parse_chains = [
    "python",
    "/home/eva/ProteinMPNN/helper_scripts/parse_multiple_chains.py",
    "--input_path", folder_with_pdbs,
    "--output_path", path_for_parsed_chains
]

# Run the parsing command
print("Running parse_multiple_chains.py to parse PDB chains...")
subprocess.run(command_parse_chains, check=True)
print("Parsing complete.")

# Step 2: Identify N and C Cap regions and set fixed positions for main body
cap_positions_dict = {}
assigned_chains_dict = {}

# Helper to detect N and C caps based on continuous glycine residues
def identify_cap_regions(sequence):
    n_cap_end_resi = 0
    c_cap_start_resi = len(sequence)

    # Detect N cap: continuous glycine from the start
    for res in sequence:
        if res == 'G':
            n_cap_end_resi += 1
        else:
            break

    # Detect C cap: continuous glycine from the end
    for res in sequence[::-1]:
        if res == 'G':
            c_cap_start_resi -= 1
        else:
            break
    #Add leeway to mutate some wild sequence with mpnn as well
    leeway = 14
    # Determine if only N cap or C cap is present
    if n_cap_end_resi > 0 and c_cap_start_resi == len(sequence):
        return list(range(1, n_cap_end_resi + leeway + 1)), []  # Only N cap, using residue indexes
    elif c_cap_start_resi < len(sequence) and n_cap_end_resi == 0:
        return [], list(range(c_cap_start_resi - leeway, len(sequence) + 1))  # Only C cap, using residue indexes
    else:
        return [], []

# Read parsed JSON and extract positions to fix main body
with open(path_for_parsed_chains, 'r') as json_file:
    parsed_data = [json.loads(line) for line in json_file]

# Generate fixed positions for each chain
for entry in parsed_data:
    pdb_name = entry["name"].split('.')[0]  # Remove file extension if present
    sequence_key = f"seq_chain_{chains_to_design}"

    # Ensure the chain is present in the parsed entry
    if sequence_key not in entry:
        print(f"Warning: Chain {chains_to_design} not found in entry {pdb_name}, skipping.")
        continue

    sequence = entry[sequence_key]

    # Identify cap regions
    n_cap, c_cap = identify_cap_regions(sequence)

    # Define main body fixed positions
    main_body_fixed_positions = [i for i in range(1, len(sequence)+1) if i not in (n_cap+c_cap)]
    
    # Store in dictionary format required by ProteinMPNN
    cap_positions_dict[pdb_name] = {chains_to_design: main_body_fixed_positions}
    print(f"Fixed positions for {pdb_name}: {cap_positions_dict[pdb_name]}")

    # Add assigned chain information with two lists as expected by ProteinMPNN
    assigned_chains_dict[pdb_name] = [[chains_to_design], []]  # Masked chains, visible chains

with open(f"{output_dir}/fixed_positions_caps.jsonl", 'w') as file:
    file.write(json.dumps(cap_positions_dict))
    
# Write assigned chains to JSONL format using the helper script
with open(f"{output_dir}/temp_assigned_chains.json", 'w') as temp_file:
    for pdb, chains in assigned_chains_dict.items():
        temp_file.write(json.dumps({pdb: chains}) + '\n')

command_assign_chains = [
    "python",
    "/home/eva/ProteinMPNN/helper_scripts/assign_fixed_chains.py",
    "--input_path", path_for_parsed_chains,
    "--output_path", path_for_assigned_chains,
    "--chain_list", chains_to_design
]
subprocess.run(command_assign_chains, check=True)
print("Assigned chains JSONL generated.")

print("Generated assigned chains and fixed positions JSON files.")

# Step 3: Run ProteinMPNN with generated JSON files
# Command to call protein_mpnn_run.py script
command_mpnn_run = [
    "python",
    "/home/eva/ProteinMPNN/protein_mpnn_run.py",
    "--jsonl_path", path_for_parsed_chains,
    "--chain_id_jsonl", path_for_assigned_chains,
    "--fixed_positions_jsonl", path_for_fixed_positions,
    "--out_folder", output_dir,
    "--use_soluble_model",
    "--num_seq_per_target", "2",
    "--sampling_temp", "0.1",
    "--omit_AAs", "CWY",
    "--seed", "37",
    "--batch_size", "1"
]

# Run the ProteinMPNN command
print("Running protein_mpnn_run.py to design sequences...")
subprocess.run(command_mpnn_run, check=True)
print("ProteinMPNN run complete.")


Running parse_multiple_chains.py to parse PDB chains...
Parsing complete.
Fixed positions for modified_Ccap_3ult_cropNT_001_cropCT_114_extendlen_033_cycle_01_3: {'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]}
Fixed positions for modified_Ccap_3ult_cropNT_001_cropCT_114_extendlen_028_cycle_02_5: {'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,

  checkpoint = torch.load(checkpoint_path, map_location=device)


Generating sequences for: modified_Ccap_3ult_cropNT_001_cropCT_114_extendlen_033_cycle_01_3
2 sequences of length 147 generated in 1.9488 seconds
Generating sequences for: modified_Ccap_3ult_cropNT_001_cropCT_114_extendlen_028_cycle_02_5
2 sequences of length 142 generated in 2.0798 seconds
Generating sequences for: modified_Ccap_3ult_cropNT_001_cropCT_114_extendlen_032_cycle_02_6
2 sequences of length 146 generated in 1.8653 seconds
Generating sequences for: modified_Ccap_3ult_cropNT_001_cropCT_114_extendlen_029_cycle_02_4
2 sequences of length 143 generated in 1.6829 seconds
Generating sequences for: modified_Ccap_3ult_cropNT_001_cropCT_114_extendlen_030_cycle_03_6
2 sequences of length 144 generated in 1.6961 seconds
Generating sequences for: modified_Ccap_3ult_cropNT_001_cropCT_114_extendlen_027_cycle_01_5
2 sequences of length 141 generated in 1.5652 seconds
ProteinMPNN run complete.
