In [2]:
import os
from Bio.PDB import PDBParser, PPBuilder
import pandas as pd

In [None]:
def extract_sequences(folder_path):
    """Collect the amino-acid sequence of every ``.pdb`` file in a folder.

    Each file is parsed with Bio.PDB's ``PDBParser``; the polypeptides that
    ``PPBuilder`` builds for every chain of the first model are concatenated
    (in chain order, with no separator) into one string.

    Args:
        folder_path: Directory to scan. Only regular files whose name ends
            with ``.pdb`` are read; sub-directories are not searched.

    Returns:
        dict mapping file name (including the ``.pdb`` extension) to the
        concatenated sequence string. Files are processed in sorted name
        order, so the dict's insertion order is reproducible. A file without
        any model or polypeptide maps to an empty string.

    Raises:
        OSError: propagated from ``os.listdir`` when the folder cannot be
            listed (for example, when it does not exist).
    """
    parser = PDBParser(QUIET=True)
    ppb = PPBuilder()
    sequences = {}

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting dict (and any spreadsheet built from it) deterministic.
    for pdb_file in sorted(os.listdir(folder_path)):
        pdb_path = os.path.join(folder_path, pdb_file)
        # Skip non-PDB names and directories that merely end in ".pdb".
        if not pdb_file.endswith(".pdb") or not os.path.isfile(pdb_path):
            continue

        structure = parser.get_structure(pdb_file, pdb_path)

        # Read the first model only. Iterating over every model would append
        # the chains of each model in turn, so a multi-model file would get
        # its sequence repeated once per model.
        first_model = next(iter(structure), None)
        chains = first_model if first_model is not None else ()

        sequences[pdb_file] = "".join(
            str(pp.get_sequence())
            for chain in chains
            for pp in ppb.build_peptides(chain)
        )
    return sequences

# Input layout: one sub-folder of PDB files per design grade.
base_path = "designs"
grade_a_path, grade_b_path, grade_c_path = (
    os.path.join(base_path, grade_dir)
    for grade_dir in ("Grade_A", "Grade_B", "Grade_C")
)

# Read each grade folder into a {filename: sequence} mapping.
sequences_a = extract_sequences(folder_path=grade_a_path)
sequences_b = extract_sequences(folder_path=grade_b_path)
sequences_c = extract_sequences(folder_path=grade_c_path)

# designs.xlsx: every Grade_A file.
df_a = pd.DataFrame(
    [{"filename": fname, "sequence": sequences_a[fname]} for fname in sequences_a]
)
df_a.to_excel("designs.xlsx", index=False)

# design_b.xlsx: Grade_B files whose name does not also appear in Grade_A.
unique_b = [
    {"filename": fname, "sequence": sequences_b[fname]}
    for fname in sequences_b
    if fname not in sequences_a
]
if not unique_b:
    print("No unique files found in Grade_B.")
else:
    pd.DataFrame(unique_b).to_excel("design_b.xlsx", index=False)

# design_c.xlsx: Grade_C files whose name does not also appear in Grade_B.
unique_c = [
    {"filename": fname, "sequence": sequences_c[fname]}
    for fname in sequences_c
    if fname not in sequences_b
]
if not unique_c:
    print("No unique files found in Grade_C.")
else:
    pd.DataFrame(unique_c).to_excel("design_c.xlsx", index=False)

In [2]:
import pandas as pd
from Bio.Seq import Seq

# Fixed flanking sequences that trim_and_translate removes from a sequence
# before translating it.
prefix = "TGGTCTCATGTGGCTCTTCTAGT"
suffix = "TAAAGAAGAGCGACCTGAGACCA"

# Load the input workbook; its "Sequence" column is translated further down.
file_path = "ordered.xlsx"
df = pd.read_excel(io=file_path)

def trim_and_translate(dna_seq):
    """Strip the flanking prefix/suffix from a DNA sequence and translate it.

    The input is first normalised (all whitespace removed, upper-cased) so
    that stray spaces or line breaks from a spreadsheet cell, or lower-case
    bases, do not prevent the flanks from being recognised. The module-level
    ``prefix`` is then removed from the start and ``suffix`` from the end when
    present, and the remainder is translated up to the first stop codon.

    Args:
        dna_seq: DNA sequence as a string. Any non-string value (for example
            the NaN pandas uses for an empty cell) is treated as missing.

    Returns:
        The translated protein sequence as a ``str``; ``""`` when ``dna_seq``
        is not a string.
    """
    if not isinstance(dna_seq, str):
        return ""

    # Normalise before matching: the flank checks below are exact string
    # comparisons, so whitespace or lower-case input would skip trimming.
    dna_seq = "".join(dna_seq.split()).upper()

    if dna_seq.startswith(prefix.upper()):
        dna_seq = dna_seq[len(prefix):]
    if dna_seq.endswith(suffix.upper()):
        # Slice by explicit end index; a negative-length slice ([:-0]) would
        # wipe the whole sequence if the suffix were ever empty.
        dna_seq = dna_seq[: len(dna_seq) - len(suffix)]

    return str(Seq(dna_seq).translate(to_stop=True))

# Translate every entry of the "Sequence" column into a new "Protein" column.
df["Protein"] = df["Sequence"].map(trim_and_translate)

# Save the augmented table without writing the row index.
df.to_excel(excel_writer="output_with_protein.xlsx", index=False)

In [2]:
import pandas as pd

# Load the workbook written by the translation step.
file_path = "output_with_protein.xlsx"
df = pd.read_excel(io=file_path)

def exact_match(seq1, seq2):
    """Report whether two sequences are identical strings.

    Args:
        seq1: First sequence; expected to be a ``str``.
        seq2: Second sequence; expected to be a ``str``.

    Returns:
        ``True`` when both arguments are strings with exactly the same
        content, ``False`` otherwise. A non-string argument (for example the
        NaN pandas uses for an empty cell) never matches.
    """
    # Missing / non-text values can never be a match.
    if not (isinstance(seq1, str) and isinstance(seq2, str)):
        return False
    # String equality already covers the length and per-character checks.
    return seq1 == seq2

# Add a "Match" column with the comparison outcome for every row. The value is
# stored as the text "True"/"False" (via str()) rather than as a boolean.
# NOTE(review): the cell that writes output_with_protein.xlsx derives "Protein"
# by translating "Sequence", so unless the workbook was edited in between this
# compares DNA text against protein text -- confirm the intended reference column.
df["Match"] = df.apply(
    lambda record: str(exact_match(record["Sequence"], record["Protein"])),
    axis="columns",
)

# Write the annotated table to disk.
df.to_excel("output_with_match.xlsx", index=False)