# 1. Save sequence info from 'results.pt' to FASTA

In [28]:
import torch
import numpy as np
import os
from Bio import SeqIO
import requests
from tqdm import tqdm

## 1.1 Results on test sets of 'CATH4.2', 'CATH4.3' for 'PiFold' and 'ProteinMPNN'

In [3]:
def convert_and_save_sequences(result, file_name):
    """Write the true and predicted sequences in ``result`` to a FASTA file.

    For every index ``i`` two records are emitted, in this order:
    ``>{title} True sequence`` followed by ``>{title} Predicted sequence``.
    The predicted sequence is the per-position argmax of ``pred_probs[i]``.

    Args:
        result: Mapping with parallel sequences under the keys ``'title'``,
            ``'true_seq'`` and ``'pred_probs'``. Each ``true_seq`` entry is
            iterated element by element and every element must expose
            ``.item()``; each ``pred_probs`` entry must support
            ``.argmax(dim=1)`` (presumably a (length, vocab) torch tensor --
            confirm against the code that writes ``results.pt``).
        file_name: Path of the FASTA file to create; an existing file is
            overwritten.

    Token indices that are not in the lookup table (for example 1-3) are
    written as ``'X'`` instead of raising ``KeyError``, so one unexpected
    token no longer aborts the whole export.
    """
    amino_acid_map = {
        0: 'X', 4: 'L', 5: 'A', 6: 'G', 7: 'V', 8: 'S', 9: 'E', 10: 'R',
        11: 'T', 12: 'I', 13: 'D', 14: 'P', 15: 'K', 16: 'Q', 17: 'N',
        18: 'F', 19: 'Y', 20: 'M', 21: 'H', 22: 'W', 23: 'C'
    }

    def decode(tokens):
        # Fall back to 'X' for any index without a letter in the table.
        return ''.join(amino_acid_map.get(tok.item(), 'X') for tok in tokens)

    # Collect the records and join once instead of growing a string with +=.
    records = []
    for i, title in enumerate(result['title']):
        true_letters = decode(result['true_seq'][i])
        pred_letters = decode(result['pred_probs'][i].argmax(dim=1))
        records.append(f">{title} True sequence\n{true_letters}\n")
        records.append(f">{title} Predicted sequence\n{pred_letters}\n")

    with open(file_name, "w") as file:
        file.write(''.join(records))

In [5]:
# Where the benchmark results live and where the generated FASTA files go.
# NOTE(review): these are absolute, machine-specific paths -- adjust them
# before running the notebook on another machine.
base_path = "/home/zhengsun/code/protein/ProteinInvBench/results/model_zoom"
output_directory = "/home/zhengsun/code/protein/ProteinInvBench/results/fasta_files"

os.makedirs(output_directory, exist_ok=True)

# One FASTA file per (dataset, method) pair, named "<dataset>_<method>.fasta".
for dataset in ['CATH4.2', 'CATH4.3']:
    for method in ['ProteinMPNN', 'PiFold']:
        file_path = os.path.join(base_path, dataset, method, "results.pt")
        fasta_file_name = f"{dataset}_{method}.fasta"
        output_file_path = os.path.join(output_directory, fasta_file_name)
        
        if os.path.exists(file_path):
            # map_location="cpu" lets results saved from a GPU run load on a
            # machine without CUDA; the conversion below only needs CPU tensors.
            # torch.load unpickles the file, so only load results you trust.
            result = torch.load(file_path, map_location="cpu")
            convert_and_save_sequences(result, output_file_path)
            print(f"Processed and saved: {output_file_path}")
        else:
            print(f"File does not exist: {file_path}")

Processed and saved: /home/zhengsun/code/protein/ProteinInvBench/results/fasta_files/CATH4.2_ProteinMPNN.fasta
Processed and saved: /home/zhengsun/code/protein/ProteinInvBench/results/fasta_files/CATH4.2_PiFold.fasta
Processed and saved: /home/zhengsun/code/protein/ProteinInvBench/results/fasta_files/CATH4.3_ProteinMPNN.fasta
Processed and saved: /home/zhengsun/code/protein/ProteinInvBench/results/fasta_files/CATH4.3_PiFold.fasta


# 2. Download PDB files

In [29]:
def extract_pdb_id(description):
    """Return the leading identifier of a FASTA record description.

    The description is split on whitespace and the first token is kept;
    everything from the first '.' in that token onwards is dropped
    (e.g. ``"1a1x.A True sequence"`` gives ``"1a1x"``).

    Raises:
        IndexError: If ``description`` is empty or contains only whitespace.
    """
    first_token = description.split()[0]
    identifier, _, _ = first_token.partition('.')
    return identifier

In [30]:
def download_pdb(pdb_id, output_dir, timeout=30):
    """Download one PDB entry from files.rcsb.org unless it is already on disk.

    The file is stored as ``<output_dir>/<pdb_id lower-cased>.pdb``. Failures
    (network error or non-200 response) are reported with ``print`` and the
    function returns without creating the file, so a batch of downloads keeps
    going after a single bad entry.

    Args:
        pdb_id: Identifier of the entry; upper-cased for the URL and
            lower-cased for the local file name.
        output_dir: Existing directory that receives the file.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled connection can block the call indefinitely.
    """
    pdb_file_path = os.path.join(output_dir, f"{pdb_id.lower()}.pdb")
    if os.path.exists(pdb_file_path):
        return

    url = f"https://files.rcsb.org/download/{pdb_id.upper()}.pdb"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download {pdb_id.upper()}. Error: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to download {pdb_id.upper()}. HTTP Status: {response.status_code}")
        return

    # Write to a temporary name and rename afterwards: an interrupted run then
    # never leaves a truncated "<id>.pdb" that the exists-check above would
    # treat as a finished download.
    partial_path = pdb_file_path + ".part"
    with open(partial_path, 'wb') as file:
        # Raw bytes avoid the charset guessing that response.text performs.
        file.write(response.content)
    os.replace(partial_path, pdb_file_path)

In [None]:
def count_total_entries(fasta_dir):
    """Count the FASTA records in every ``*.fasta`` file directly in ``fasta_dir``.

    Args:
        fasta_dir: Directory to scan; sub-directories are not descended into
            and files with other extensions are ignored.

    Returns:
        The total number of records over all matching files (0 if none).
    """
    return sum(
        sum(1 for _ in SeqIO.parse(os.path.join(fasta_dir, name), "fasta"))
        for name in os.listdir(fasta_dir)
        if name.endswith(".fasta")
    )

In [31]:
def process_fasta_files(fasta_dir, output_dir):
    """Download the PDB file referenced by every record of every FASTA file.

    Each ``*.fasta`` file directly in ``fasta_dir`` is parsed; for every record
    the identifier is taken from its description with ``extract_pdb_id`` and
    handed to ``download_pdb`` together with ``output_dir``. Progress is shown
    with one outer bar over the files and a transient inner bar per file.

    Args:
        fasta_dir: Directory containing the FASTA files.
        output_dir: Directory passed through to ``download_pdb``.
    """
    fasta_names = [name for name in os.listdir(fasta_dir) if name.endswith(".fasta")]
    for fasta_name in tqdm(fasta_names, desc="Processing FASTA files"):
        # Materialise the records so the inner progress bar knows its total.
        entries = list(SeqIO.parse(os.path.join(fasta_dir, fasta_name), "fasta"))
        inner_bar = tqdm(entries, desc=f"Downloading PDB files from {fasta_name}", leave=False)
        for entry in inner_bar:
            download_pdb(extract_pdb_id(entry.description), output_dir)

In [32]:
# Download the reference PDB files for every record in the generated FASTA files.
# NOTE(review): `output_directory` is reassigned here; in the earlier export cell
# it pointed at the FASTA output folder, from this cell on it is the PDB folder.
fasta_directory = "/home/zhengsun/code/protein/ProteinInvBench/results/fasta_files"
output_directory = "/home/zhengsun/code/protein/ProteinInvBench/results/reference_pdb"
os.makedirs(output_directory, exist_ok=True)
# Network-bound: issues one download_pdb call per FASTA record.
process_fasta_files(fasta_directory, output_directory)

Processing FASTA files:   0%|          | 0/4 [00:23<?, ?it/s]


KeyboardInterrupt: 

In [14]:
import requests

def download_pdb(pdb_id, output_dir=".", timeout=30):
    """Download one PDB entry from files.rcsb.org and report the outcome.

    On HTTP 200 the body is saved as ``<output_dir>/<pdb_id upper-cased>.pdb``
    and a success message is printed; any other status prints a failure
    message and writes nothing.

    Args:
        pdb_id: Identifier of the entry; upper-cased for the URL and file name.
        output_dir: Directory that receives the file. Defaults to the current
            working directory, which is where the one-argument form has always
            written. Accepting it keeps this definition callable as
            ``download_pdb(pdb_id, output_dir)``, the form used by
            ``process_fasta_files``.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled connection can block the call indefinitely.
    """
    url = f"https://files.rcsb.org/download/{pdb_id.upper()}.pdb"
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        print(f"Download successful for {pdb_id}")
        # Save the downloaded file locally.
        with open(os.path.join(output_dir, f"{pdb_id.upper()}.pdb"), "w") as file:
            file.write(response.text)
    else:
        print(f"Failed to download {pdb_id}. HTTP Status: {response.status_code}")

# Test the download
download_pdb('1A1X')  # replace this with any valid PDB ID


Download successful for 1A1X


IsADirectoryError: [Errno 21] Is a directory: '/home/zhengsun/code/protein/ProteinInvBench/results/fasta_files'