In [5]:
import os
import esm
import time
import csv
import torch
from data import utils as du
from typing import Dict
from Bio import SeqIO


In [6]:
def run_folding(sequence, save_path, target_gpu_index):
    """Run ESMFold on sequence."""


    if torch.cuda.is_available():
        device_index = target_gpu_index
        device_name = f"cuda:{device_index}"
        device = torch.device(device_name)
        model = esm.pretrained.esmfold_v1()
        model = model.to(device)
        model = model.eval()    
    
    
        
    with torch.no_grad():
        output = model.infer_pdb(sequence)

    with open(save_path, "w") as f:
        f.write(output)
    import biotite.structure.io as bsio
    struct = bsio.load_structure(save_path, extra_fields=["b_factor"])
    plddt = struct.b_factor.mean()
    return output, plddt


In [7]:
def run_esm_all(sample_sequences, decoy_pdb_dir, model_dataset):
    total_samples = len(sample_sequences)
    start_time = time.time()

    os.makedirs(decoy_pdb_dir, exist_ok=True)
    plddt_csv_path = os.path.join(decoy_pdb_dir, model_dataset, "plddt_values.csv")

    write_header = not os.path.exists(plddt_csv_path)

    with open(plddt_csv_path, "a", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        if write_header:
            csv_writer.writerow(["Sample", "PLDDT"])

        for idx, (sample, sequence) in enumerate(sample_sequences.items(), 1):
            esmf_dir = os.path.join(decoy_pdb_dir, model_dataset)
            os.makedirs(esmf_dir, exist_ok=True)
            esmf_sample_path = os.path.join(esmf_dir, f'{sample}.pdb')

            if os.path.exists(esmf_sample_path):
                print(f"File {esmf_sample_path} already exists. Skipping...")
                continue

            output, plddt_value = run_folding(sequence, esmf_sample_path, 0)
            print(plddt_value)

            csv_writer.writerow([sample, plddt_value])

            elapsed_time = time.time() - start_time
            progress_percentage = (idx / total_samples) * 100
            print(f"Progress: {progress_percentage:.2f}% | Elapsed Time: {elapsed_time:.2f}s | ESM results saved for {sample} in {esmf_sample_path}")

    print(f"ESM calculations for all samples completed and saved to CSV at {plddt_csv_path}.")


In [8]:

def read_predicted_sequences(fasta_file_path):
    predicted_sequences = {}

    for record in SeqIO.parse(fasta_file_path, "fasta"):
        header = record.description
        if "Predicted sequence" in header:
            pdb_id = header.split()[0].split('.')[0]
            sequence = str(record.seq)
            predicted_sequences[pdb_id] = sequence

    return predicted_sequences


# CATH4.2 PiFold

In [5]:
fasta_file_path = "/home/zhengsun/code/protein/ProteinInvBench/results/fasta_files/CATH4.2_PiFold.fasta"
sample_sequences = read_predicted_sequences(fasta_file_path)

In [6]:
decoy_pdb_dir = "/home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb"
run_esm_all(sample_sequences, decoy_pdb_dir, 'cath42pifold')

File /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42pifold/1a1x.pdb already exists. Skipping...
File /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42pifold/1a2p.pdb already exists. Skipping...
File /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42pifold/1a32.pdb already exists. Skipping...
File /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42pifold/1a73.pdb already exists. Skipping...
File /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42pifold/1a8l.pdb already exists. Skipping...
File /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42pifold/1a90.pdb already exists. Skipping...
File /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42pifold/1aal.pdb already exists. Skipping...
File /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42pifold/1aaz.pdb already exists. Skipping...
File /home/zhengsun/code/protein/Protein

# CATH4.2 ProteinMPNN

In [7]:
fasta_file_path = "/home/zhengsun/code/protein/ProteinInvBench/results/fasta_files/CATH4.2_ProteinMPNN.fasta"
sample_sequences = read_predicted_sequences(fasta_file_path)

In [8]:
decoy_pdb_dir = "/home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb"
run_esm_all(sample_sequences, decoy_pdb_dir, 'cath42mpnn')

49.18970319634704
Progress: 0.09% | Elapsed Time: 25.06s | ESM results saved for 1a1x in /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42mpnn/1a1x.pdb
85.61691656590084
Progress: 0.18% | Elapsed Time: 50.22s | ESM results saved for 1a2p in /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42mpnn/1a2p.pdb
78.06617052023121
Progress: 0.28% | Elapsed Time: 75.06s | ESM results saved for 1a32 in /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42mpnn/1a32.pdb
66.59598199672668
Progress: 0.37% | Elapsed Time: 101.28s | ESM results saved for 1a73 in /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42mpnn/1a73.pdb
89.44286034772854
Progress: 0.46% | Elapsed Time: 128.66s | ESM results saved for 1a8l in /home/zhengsun/code/protein/ProteinInvBench/results/esm_fold_pdb/cath42mpnn/1a8l.pdb
55.35636150234741
Progress: 0.55% | Elapsed Time: 153.77s | ESM results saved for 1a90 in /home/zhengsun/code/protein/ProteinInv

# CATH4.3 ProteinMPNN

In [9]:
fasta_file_path = "/home/sean/Desktop/protein/ProteinInvBench/results/fasta_files/CATH4.2_ProteinMPNN.fasta"
sample_sequences = read_predicted_sequences(fasta_file_path)

In [12]:
decoy_pdb_dir = "/home/sean/Desktop/protein/ProteinInvBench/results/esm_fold_pdb"
run_esm_all(sample_sequences, decoy_pdb_dir, 'cath43mpnn')

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esmfold_3B_v1.pt" to /home/sean/.cache/torch/hub/checkpoints/esmfold_3B_v1.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t36_3B_UR50D.pt" to /home/sean/.cache/torch/hub/checkpoints/esm2_t36_3B_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t36_3B_UR50D-contact-regression.pt" to /home/sean/.cache/torch/hub/checkpoints/esm2_t36_3B_UR50D-contact-regression.pt


49.18970319634704
Progress: 0.09% | Elapsed Time: 455.50s | ESM results saved for 1a1x in /home/sean/Desktop/protein/ProteinInvBench/results/esm_fold_pdb/cath43mpnn/1a1x.pdb
85.61691656590084
Progress: 0.18% | Elapsed Time: 485.74s | ESM results saved for 1a2p in /home/sean/Desktop/protein/ProteinInvBench/results/esm_fold_pdb/cath43mpnn/1a2p.pdb
78.06617052023121
Progress: 0.28% | Elapsed Time: 520.89s | ESM results saved for 1a32 in /home/sean/Desktop/protein/ProteinInvBench/results/esm_fold_pdb/cath43mpnn/1a32.pdb
66.59598199672668
Progress: 0.37% | Elapsed Time: 557.74s | ESM results saved for 1a73 in /home/sean/Desktop/protein/ProteinInvBench/results/esm_fold_pdb/cath43mpnn/1a73.pdb
89.44286034772854
Progress: 0.46% | Elapsed Time: 595.06s | ESM results saved for 1a8l in /home/sean/Desktop/protein/ProteinInvBench/results/esm_fold_pdb/cath43mpnn/1a8l.pdb
55.35636150234741
Progress: 0.55% | Elapsed Time: 629.13s | ESM results saved for 1a90 in /home/sean/Desktop/protein/ProteinInvBen