# In [4]:
import tkinter as tk
from tkinter import filedialog
import pandas as pd
import numpy as np
import esm
import torch
import os


# Holds the loaded model/alphabet so repeated calls (one per button click)
# do not reload the pretrained weights every time.
_ESM_MODEL_CACHE = {}


def _load_esm_model():
    """Return the ``(model, alphabet)`` pair, loading it on first use only."""
    if not _ESM_MODEL_CACHE:
        model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
        model.eval()  # inference only
        _ESM_MODEL_CACHE["model"] = model
        _ESM_MODEL_CACHE["alphabet"] = alphabet
    return _ESM_MODEL_CACHE["model"], _ESM_MODEL_CACHE["alphabet"]


def esm_embed(sequence_list):
    """Embed sequences with the pretrained ``esm2_t6_8M_UR50D`` model.

    Each sequence is represented by the mean of its layer-6 token
    representations, skipping the first and last non-padding token
    (presumably the begin/end markers added by the batch converter).

    Args:
        sequence_list: Iterable of sequence strings.

    Returns:
        A ``pandas.DataFrame`` with one row per input sequence (in input
        order) and one column per embedding dimension. An empty input yields
        an empty DataFrame without loading the model.

    Raises:
        TypeError: If any entry is not a ``str`` (for example a NaN coming
            from an empty spreadsheet cell).
    """
    sequences = list(sequence_list)
    if not sequences:
        return pd.DataFrame()

    # Fail early with a clear message instead of an obscure error raised
    # from inside tokenisation.
    for position, seq in enumerate(sequences):
        if not isinstance(seq, str):
            raise TypeError(
                f"sequence at index {position} must be a str, "
                f"got {type(seq).__name__}"
            )

    model, alphabet = _load_esm_model()
    batch_converter = alphabet.get_batch_converter()

    # The batch converter takes (label, sequence) pairs; the sequence itself
    # is used as its label.
    data = [(seq, seq) for seq in sequences]
    _, _, batch_tokens = batch_converter(data)
    # Number of non-padding tokens per row.
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

    with torch.no_grad():
        # Only the representations are used, so contact prediction is not
        # requested.
        results = model(batch_tokens, repr_layers=[6], return_contacts=False)
    token_representations = results["representations"][6]

    sequence_representations = [
        token_representations[i, 1:token_len - 1].mean(0).tolist()
        for i, token_len in enumerate(batch_lens)
    ]
    return pd.DataFrame(sequence_representations)


def open_file():
    """Ask the user for an ``.xlsx`` file and show its path in the path entry.

    If the dialog is dismissed without a selection, the entry keeps its
    current content.
    """
    file_path = filedialog.askopenfilename(filetypes=[("Excel Files", "*.xlsx")])
    # A cancelled dialog yields an empty value; do not wipe a path the user
    # already picked or typed.
    if not file_path:
        return
    file_path_entry.delete(0, tk.END)
    file_path_entry.insert(tk.END, file_path)


def process_file():
    """Embed the "Sequence" column of the selected Excel file.

    Reads the path from ``file_path_entry``, embeds every value of the
    "Sequence" column with ``esm_embed`` and writes the result next to the
    input as ``<input name>_embedding.xlsx``. The outcome (success or
    failure) is shown in ``result_text``.

    Raises:
        Exception: Any error raised while reading, embedding or writing is
            re-raised after being shown to the user, so Tk's callback error
            reporting still receives it.
    """
    file_path = file_path_entry.get()

    # Clear the previous outcome first so a stale success message never
    # remains on screen after a failure.
    result_text.delete("1.0", tk.END)

    if not file_path.strip():
        result_text.insert(tk.END, "请先选择文件")
        return

    output_file_path = os.path.splitext(file_path)[0] + "_embedding.xlsx"

    try:
        df = pd.read_excel(file_path)
        if "Sequence" not in df.columns:
            result_text.insert(tk.END, "文件中缺少 Sequence 列")
            return
        sequences = df["Sequence"].tolist()
        embedding_df = esm_embed(sequences)
        embedding_df.to_excel(output_file_path, index=False)
    except Exception as exc:
        # GUI callback boundary: tell the user, then let the error propagate.
        result_text.insert(tk.END, f"处理失败: {exc}")
        raise

    result_text.insert(tk.END, f"嵌入结果已保存到 {output_file_path}")


def process_input_sequence():
    """Embed the sequence typed into ``input_sequence_entry``.

    Surrounding whitespace is stripped. The embedding is written to
    ``embedding_result.xlsx`` (relative to the current working directory)
    and the outcome is shown in ``result_text``.

    Raises:
        Exception: Any error raised while embedding or writing is re-raised
            after being shown to the user, so Tk's callback error reporting
            still receives it.
    """
    input_sequence = input_sequence_entry.get().strip()

    # Clear the previous outcome first so a stale success message never
    # remains on screen after a failure.
    result_text.delete("1.0", tk.END)

    # A blank sequence has nothing to embed; ask for input instead.
    if not input_sequence:
        result_text.insert(tk.END, "请输入序列")
        return

    try:
        embedding_df = esm_embed([input_sequence])
        embedding_df.to_excel("embedding_result.xlsx", index=False)
    except Exception as exc:
        # GUI callback boundary: tell the user, then let the error propagate.
        result_text.insert(tk.END, f"处理失败: {exc}")
        raise

    result_text.insert(tk.END, "嵌入结果已保存到 embedding_result.xlsx")


# Main application window; all widgets below are its direct children.
window = tk.Tk()

# File selection: caption, path entry and browse button.
file_path_label = tk.Label(window, text="选择文件:")
file_path_entry = tk.Entry(window)
file_path_button = tk.Button(window, text="选择文件", command=open_file)

# Button that embeds the selected file.
process_file_button = tk.Button(window, text="处理文件", command=process_file)

# Single-sequence input: caption, entry and its process button.
input_sequence_label = tk.Label(window, text="输入序列:")
input_sequence_entry = tk.Entry(window)
process_sequence_button = tk.Button(window, text="处理序列", command=process_input_sequence)

# Result display: caption and text box that the callbacks write into.
result_label = tk.Label(window, text="处理结果:")
result_text = tk.Text(window)

# Pack the widgets with default options, in the order they appear on screen.
for _widget in (
    file_path_label,
    file_path_entry,
    file_path_button,
    process_file_button,
    input_sequence_label,
    input_sequence_entry,
    process_sequence_button,
    result_label,
    result_text,
):
    _widget.pack()

# Hand control to Tk; this call blocks until the window is closed.
window.mainloop()