In [1]:
import pandas as pd

# 原始路径
txt_path = "C:/Users/数学迷迷迷/Desktop/蛋白质/vdjdb.txt"
csv_path = "C:/Users/数学迷迷迷/Desktop/蛋白质/vdjdb.csv"

# 加载并保存
df = pd.read_csv(txt_path, sep='\t')
df.to_csv(csv_path, index=False)

print(f"✅ 成功保存为 CSV 文件，路径为：{csv_path}")


✅ 成功保存为 CSV 文件，路径为：C:/Users/数学迷迷迷/Desktop/蛋白质/vdjdb.csv


In [2]:
import pandas as pd

# 读取完整 CSV 文件
df = pd.read_csv("C:/Users/数学迷迷迷/Desktop/蛋白质/vdjdb.csv")

# 提取前 500 条数据
df_head500 = df.head(500)

# 保存为新文件
df_head500.to_csv("C:/Users/数学迷迷迷/Desktop/蛋白质/vdjdb_head500.csv", index=False)

print("✅ 已成功保存前500条记录为 vdjdb_head500.csv")


✅ 已成功保存前500条记录为 vdjdb_head500.csv


In [3]:
import pandas as pd

# 设置输入文件路径（请根据实际情况修改）
input_path = r"C:\Users\数学迷迷迷\Desktop\蛋白质\protgpt2_finetune_data.txt"

# 读取每一行作为一个蛋白质序列
with open(input_path, "r", encoding="utf-8") as file:
    sequences = file.read().splitlines()

# 转换为DataFrame
df = pd.DataFrame(sequences, columns=["Sequence"])

# 设置输出文件路径
output_path = r"C:\Users\数学迷迷迷\Desktop\蛋白质\protgpt2_finetune_data.csv"

# 保存为CSV文件
df.to_csv(output_path, index=False, encoding="utf-8")

print("转换完成，文件已保存为:", output_path)


转换完成，文件已保存为: C:\Users\数学迷迷迷\Desktop\蛋白质\protgpt2_finetune_data.csv


In [5]:
import pandas as pd
from tqdm import tqdm
from collections import Counter
from rapidfuzz.distance import Levenshtein

# ========= 文件路径 ========= #
vdjdb_path = r"C:\Users\数学迷迷迷\Desktop\蛋白质\vdjdb.csv"
gpt_path = r"C:\Users\数学迷迷迷\Desktop\蛋白质\protgpt2_finetune_data.csv"

# ========= 加载数据 ========= #
print("读取数据中...")
vdjdb_df = pd.read_csv(vdjdb_path)
gpt_df = pd.read_csv(gpt_path)

# ========= 提取 GPT 训练集 CDR3 ========= #
print("提取 GPT CDR3...")
gpt_df["gpt_cdr3"] = gpt_df["Sequence"].str.split("XXX").str[0]
gpt_cdr3_list = gpt_df["gpt_cdr3"].dropna().unique().tolist()

# ========= 过滤掉含 'O' 的训练序列 ========= #
gpt_cdr3_list = [seq for seq in gpt_cdr3_list if "O" not in seq]

# ========= 提取 score=0 且合法的样本 ========= #
print("筛选合法 score=0 样本...")
if "cdr3" not in vdjdb_df.columns:
    raise ValueError("vdjdb.csv 缺少 'cdr3' 列，请检查列名")

score0_df = vdjdb_df[(vdjdb_df["vdjdb.score"] == 0) & (vdjdb_df["cdr3"].notna())].copy()
score0_df = score0_df[~score0_df["cdr3"].str.contains("O")]

cdr3_list = score0_df["cdr3"].tolist()
print(f"有效的 score=0 样本数量: {len(cdr3_list)}")

# ========= 快速计算最小编辑距离 ========= #
min_dists = []
for cdr3 in tqdm(cdr3_list, desc="快速计算最小编辑距离"):
    d_min = float("inf")
    for ref in gpt_cdr3_list:
        d = Levenshtein.distance(cdr3, ref)
        if d < d_min:
            d_min = d
        if d_min <= 2:
            break  # 提前退出加速
    min_dists.append(d_min)

# ========= 保存结果 & 统计 ========= #
score0_df["min_edit_distance"] = min_dists
dist_counter = Counter(min_dists)
count_dist_2_or_less = sum(d <= 2 for d in min_dists)
count_dist_3 = dist_counter[3]

print("\n====== 统计结果 ======")
for dist in sorted(dist_counter):
    print(f"编辑距离 = {dist}: {dist_counter[dist]} 条")
print(f"\n编辑距离 ≤ 2 的样本数（class_2 候选）: {count_dist_2_or_less}")
print(f"编辑距离 = 3 的样本数（class_3 边缘）: {count_dist_3}")


读取数据中...
提取 GPT CDR3...
筛选合法 score=0 样本...
有效的 score=0 样本数量: 4299


快速计算最小编辑距离: 100%|████████████████████████████████████████████████████████| 4299/4299 [01:33<00:00, 45.99it/s]


编辑距离 = 0: 3 条
编辑距离 = 1: 170 条
编辑距离 = 2: 1359 条
编辑距离 = 3: 1369 条
编辑距离 = 4: 636 条
编辑距离 = 5: 395 条
编辑距离 = 6: 218 条
编辑距离 = 7: 88 条
编辑距离 = 8: 29 条
编辑距离 = 9: 10 条
编辑距离 = 10: 9 条
编辑距离 = 11: 3 条
编辑距离 = 12: 1 条
编辑距离 = 14: 2 条
编辑距离 = 15: 1 条
编辑距离 = 19: 1 条
编辑距离 = 20: 3 条
编辑距离 = 22: 1 条
编辑距离 = 23: 1 条

编辑距离 ≤ 2 的样本数（class_2 候选）: 1532
编辑距离 = 3 的样本数（class_3 边缘）: 1369





In [6]:
import pandas as pd
from tqdm import tqdm
from rapidfuzz.distance import Levenshtein

# ========= 文件路径 ========= #
vdjdb_path = r"C:\Users\数学迷迷迷\Desktop\蛋白质\vdjdb.csv"
gpt_path = r"C:\Users\数学迷迷迷\Desktop\蛋白质\protgpt2_finetune_data.csv"
output_path = r"C:\Users\数学迷迷迷\Desktop\蛋白质\vdjdb_labeled.csv"

# ========= 加载数据 ========= #
print("读取数据中...")
vdjdb_df = pd.read_csv(vdjdb_path)
gpt_df = pd.read_csv(gpt_path)

# ========= 过滤 GPT 训练集非法序列并提取 CDR3 ========= #
print("提取 GPT CDR3（去除含O）...")
gpt_df = gpt_df[~gpt_df["Sequence"].str.contains("O", na=False)]  # ✅ 先删非法序列
gpt_df["gpt_cdr3"] = gpt_df["Sequence"].str.split("XXX").str[0]
gpt_cdr3_list = gpt_df["gpt_cdr3"].dropna().unique().tolist()

# ========= 初始化 class 列 ========= #
vdjdb_df["class"] = None

# ========= class = 3: score >= 1 ========= #
vdjdb_df.loc[vdjdb_df["vdjdb.score"] >= 1, "class"] = 3

# ========= 处理 score = 0 的序列 ========= #
print("处理 score = 0 的序列（计算编辑距离）...")
score0_df = vdjdb_df[(vdjdb_df["vdjdb.score"] == 0) & (vdjdb_df["cdr3"].notna())].copy()
score0_df = score0_df[~score0_df["cdr3"].str.contains("O", na=False)]  # ✅ 过滤非法CDR3

# ========= 计算编辑距离 ========= #
min_dists = []
for cdr3 in tqdm(score0_df["cdr3"], desc="快速计算编辑距离"):
    d_min = float("inf")
    for ref in gpt_cdr3_list:
        d = Levenshtein.distance(cdr3, ref)
        if d < d_min:
            d_min = d
        if d_min <= 2:
            break  # 加速退出
    min_dists.append(d_min)

score0_df["min_edit_distance"] = min_dists

# ========= 重新赋予 class（2为接近，1为远离） ========= #
score0_df["class"] = score0_df["min_edit_distance"].apply(lambda d: 2 if d <= 2 else 1)

# ========= 合并回主表 ========= #
vdjdb_df.update(score0_df[["cdr3", "class"]])

# ========= 保存 ========= #
vdjdb_df.to_csv(output_path, index=False)
print(f"\n✅ 已完成分类，文件保存至：{output_path}")


读取数据中...
提取 GPT CDR3（去除含O）...
处理 score = 0 的序列（计算编辑距离）...


快速计算编辑距离: 100%|████████████████████████████████████████████████████████████| 4299/4299 [01:34<00:00, 45.73it/s]



✅ 已完成分类，文件保存至：C:\Users\数学迷迷迷\Desktop\蛋白质\vdjdb_labeled.csv
