In [1]:
# Get the surface amino acids
import numpy as np
import os
import Bio
import shutil
from Bio.PDB import * 
import sys
import importlib
from IPython.core.debugger import set_trace
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm
import warnings
from Bio import BiopythonParserWarning

# Local includes
from triangulation.computeMSMS import computeMSMS


def get_surface_aa(path_to_pdb):
    """Return the residue numbers that contribute vertices to the MSMS surface.

    Runs ``computeMSMS`` on the given PDB file (with ``protonate=True``) and
    reads a residue number out of every per-vertex name it returns.

    Args:
        path_to_pdb: Path of the PDB file handed to ``computeMSMS``.

    Returns:
        list[int]: The unique residue numbers, in ascending order.

    Raises:
        IndexError: If a vertex name contains no ``_`` separator.
        ValueError: If the second ``_``-separated field is not an integer.
    """
    # Only the per-vertex names are used; the other four outputs are discarded.
    _, _, _, vertex_names, _ = computeMSMS(path_to_pdb, protonate=True)

    # The second "_"-separated field of each name is read as the residue number
    # (presumably names look like "<chain>_<resnum>_..." - confirm against computeMSMS).
    residue_numbers = {int(name.split('_')[1]) for name in vertex_names}

    # Sort numerically so the result does not depend on set iteration order,
    # which varies between interpreter runs for string keys.
    return sorted(residue_numbers)

def selected_surface_aa(surface_index, sequence):
    """Build the sub-sequence made of the residues at the given positions.

    Args:
        surface_index: Iterable of 1-based positions into ``sequence``.
            It is not modified. NOTE(review): when these come from PDB
            residue numbers, this assumes the numbering starts at 1 and has
            no gaps - confirm for the structures in use.
        sequence: The full protein sequence.

    Returns:
        str: The residues at the requested positions, concatenated in
        ascending position order.

    Raises:
        IndexError: If a position is smaller than 1 or larger than
            ``len(sequence)``.
    """
    # sorted() makes a copy, so the caller's list keeps its original order.
    ordered_positions = sorted(surface_index)

    # A position below 1 would turn into a negative index and silently pick a
    # residue from the end of the sequence, so reject it explicitly.
    if ordered_positions and ordered_positions[0] < 1:
        raise IndexError(
            f"surface position {ordered_positions[0]} is out of range; positions are 1-based"
        )

    # Positions are 1-based, string indices are 0-based.
    return ''.join(sequence[pos - 1] for pos in ordered_positions)


In [2]:
def get_protein_sequence_from_pdb(pdb_path):
    """Read the sequence of the first record found in a PDB file.

    The file is parsed with ``SeqIO.parse`` in ``"pdb-atom"`` format and only
    the first record it yields is used; any further records are ignored.

    Args:
        pdb_path: Path of the PDB file to open.

    Returns:
        str | None: The first record's sequence as a string, or ``None`` when
        the parser yields no record at all.
    """
    with open(pdb_path, "r") as handle:
        first_record = next(iter(SeqIO.parse(handle, "pdb-atom")), None)
        if first_record is None:
            return None
        return str(first_record.seq)

In [3]:
import pandas as pd
import warnings
from tqdm import tqdm

def is_valid_protein_sequence(sequence):
    """
    Tell whether every residue in ``sequence`` is one of the 20 standard
    amino acids, given as upper-case one-letter codes.

    The comparison is case-sensitive, and an empty sequence counts as valid.
    """
    allowed_residues = frozenset("ACDEFGHIKLMNPQRSTVWY")
    for residue in sequence:
        # Stop at the first residue outside the standard alphabet.
        if residue not in allowed_residues:
            return False
    return True

def get_surface_pos(path_to_csv, output_csv, pdb_dir="pdbs"):
    """Annotate every CSV row with the surface residue numbers of its structure.

    Reads ``path_to_csv``, looks up ``<pdb_dir>/<entry>.pdb`` for each row's
    ``entry`` value, and writes the table with an added ``surface_index``
    column to ``output_csv``. Rows whose sequence fails
    ``is_valid_protein_sequence`` or whose processing raises get an empty
    ``surface_index``; their entries are collected and printed at the end.

    Args:
        path_to_csv: CSV file to read; it must contain an ``entry`` column.
        output_csv: Destination the annotated table is written to.
        pdb_dir: Directory holding the ``<entry>.pdb`` files.

    Returns:
        None. The result is written to ``output_csv``.
    """
    df = pd.read_csv(path_to_csv)

    surface_index_list = []
    bad_data = []

    # Silence warnings only for the duration of this call instead of
    # permanently changing the process-wide warning filters.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        for _, row in tqdm(df.iterrows(), total=len(df)):
            entry = row['entry']
            pdb_path = f"{pdb_dir}/{entry}.pdb"
            surface_index = None
            try:
                seq = get_protein_sequence_from_pdb(pdb_path)
                if is_valid_protein_sequence(seq):
                    surface_index = get_surface_aa(pdb_path)
                else:
                    bad_data.append(entry)
            except Exception as e:
                # Best effort: report the problem and carry on with the next row.
                print(e)
                bad_data.append(entry)
            # Always record exactly one value per row so the new column lines
            # up with the table even when some entries are skipped.
            surface_index_list.append(surface_index)

    print(bad_data)
    df["surface_index"] = pd.Series(surface_index_list, index=df.index, dtype=object)
    df.to_csv(output_csv, index=False)

In [5]:
# Annotate 1fhe.csv with surface residue indices. The same path is passed as
# input and output, so the file is overwritten in place.
get_surface_pos('1fhe.csv', '1fhe.csv')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.04it/s]

[]



