# Parse ProteinMPNN sequences

In [9]:
import pandas as pd
import glob
import re
import os

# Path to the ProteinMPNN FASTA output; it is opened and parsed by the cell below,
# and its directory / base name are reused to build the later output paths.
mpnn_results="output/3HB/seqs/3HB.fa"

In [10]:
# Parallel lists: one entry per header/sequence pair found in the FASTA file
T_values, sample_values, score_values, global_score_values, seq_recovery_values, sequences = [], [], [], [], [], []

with open(mpnn_results, "r") as file:
    # Skip the first two rows
    lines = file.readlines()[2:]

# Iterate over the rest; each header line (">...") is expected to be followed
# directly by its sequence line
for i, line in enumerate(lines):
    if not line.startswith(">"):
        continue

    # Split the header into its comma-separated "name=value" items.
    # Raw string on purpose: "\s" inside a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    scores = re.split(r",\s*", line[1:].strip())

    # Fail with a clear message instead of an IndexError (header on the last
    # line) or silently storing the next header as a sequence.
    if i + 1 >= len(lines) or lines[i + 1].startswith(">"):
        raise ValueError(
            f"{mpnn_results}: header is not followed by a sequence line: {line.strip()!r}"
        )

    # Extract the desired fields from the header (fields are read by position)
    T_value = float(scores[0].split("=")[1])
    sample_value = int(scores[1].split("=")[1])
    score_value = float(scores[2].split("=")[1])
    global_score_value = float(scores[3].split("=")[1])
    seq_recovery_value = float(scores[4].split("=")[1])

    # Save the values only after the whole record parsed, so the lists stay aligned
    T_values.append(T_value)
    sample_values.append(sample_value)
    score_values.append(score_value)
    global_score_values.append(global_score_value)
    seq_recovery_values.append(seq_recovery_value)

    # Save the next line as the sequence
    sequences.append(lines[i + 1].strip())

# Create a dataframe from the collected data
df = pd.DataFrame({
    "t": T_values,
    "sample": sample_values,
    "score": score_values,
    "global_Score": global_score_values,
    "seq_Recovery": seq_recovery_values,
    "sequence": sequences
})

# Persist the table next to the FASTA file so later cells can reload it
df.to_csv(f"{os.path.dirname(mpnn_results)}/seqs.csv", index=False)
print(f"number of sequences: {df.shape[0]}")

number of sequences: 1000


# Prepare inputs for AF2 predictions
#### Group sequences for batch prediction

In [11]:
# Reload the parsed sequences from the CSV stored in the FASTA file's directory
df = pd.read_csv(f"{os.path.dirname(mpnn_results)}/seqs.csv")

# Set number of sequences to be predicted per job
grouped_sequences = 30
num_seq = len(df)

# Ceiling division: a partially filled last group still needs its own job
full_groups, leftover = divmod(num_seq, grouped_sequences)
array_jobs = full_groups + (1 if leftover else 0)

print(f"Slurm will run: {array_jobs} AF2 jobs, with each one predicting up to {grouped_sequences} sequences")

Slurm will run: 34 AF2 jobs, with each one predicting up to 30 sequences


In [12]:
# Set AF2 input folder (named after the FASTA file's base name, up to the first dot)
basename = os.path.basename(mpnn_results).split('.')[0]
af2_input_folder = f"output/{basename}/tmp/af2_in"
os.makedirs(af2_input_folder, exist_ok=True)

# Write one FASTA file per array job, each holding up to `grouped_sequences` sequences
for job_index in range(array_jobs):
    start_index = job_index * grouped_sequences
    end_index = start_index + grouped_sequences
    sequences = df["sequence"].iloc[start_index:end_index]

    # Files are numbered from 1
    file_path = f"{af2_input_folder}/fasta_{job_index + 1}.fasta"

    # Record IDs are "<job_index>_<1-based sequence number>_<basename>".
    # Note: the ID uses the 0-based job_index while the file name uses job_index + 1.
    records = [
        f">{job_index}_{i}_{basename}\n{seq}\n"
        for i, seq in enumerate(sequences, start=start_index + 1)
    ]
    with open(file_path, "w") as file:
        file.writelines(records)
print(f"Generated {array_jobs} fasta files in folder: {af2_input_folder}")

Generated 34 fasta files in folder: output/7urv_FMC63/tmp/af2_in


# Run AF2 prediction

In [13]:
import shlex
import subprocess

# Setup arguments
# "%18" is the sbatch array limit on how many tasks may run at the same time
bash_arguments = f"--array=1-{array_jobs}%18"
colabfold_arguments = ""

# Build the command as an argument list instead of a shell string: no shell is
# involved, so a path containing spaces or shell metacharacters is passed to
# sbatch unchanged. The option strings are split the way a POSIX shell would.
command = [
    "sbatch",
    *shlex.split(bash_arguments),
    "helper_scripts/AF2_array.sh",
    af2_input_folder,
    *shlex.split(colabfold_arguments),
]

# Run the array bash script; check=True raises CalledProcessError if sbatch
# rejects the submission instead of letting the failure pass unnoticed
subprocess.run(command, check=True)

Submitted batch job 139666


CompletedProcess(args='sbatch --array=1-34%18 helper_scripts/AF2_array.sh output/7urv_FMC63/tmp/af2_in ', returncode=0)

In [14]:
# Show the current user's Slurm queue to check on the submitted array job
!squeue --me

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
 139666_[19-34%18]       gpu AF2_arra  tsatler PD       0:00      1 (JobArrayTaskLimit)
          139666_1       gpu AF2_arra  tsatler  R       0:07      1 compute-0-12
          139666_2       gpu AF2_arra  tsatler  R       0:07      1 compute-0-12
          139666_3       gpu AF2_arra  tsatler  R       0:07      1 compute-0-12
          139666_4       gpu AF2_arra  tsatler  R       0:07      1 compute-0-12
          139666_5       gpu AF2_arra  tsatler  R       0:07      1 compute-0-12
          139666_6       gpu AF2_arra  tsatler  R       0:07      1 compute-0-12
          139666_7       gpu AF2_arra  tsatler  R       0:07      1 compute-0-12
          139666_8       gpu AF2_arra  tsatler  R       0:07      1 compute-0-10
          139666_9       gpu AF2_arra  tsatler  R       0:07      1 compute-0-10
         139666_10       gpu AF2_arra  tsatler  R       0:07      1 compute-0-10
         139666_1