In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Hugging Face Hub checkpoint id; the tokenizer and the encoder must come from
# the same checkpoint so that vocabulary and weights match.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-v1.1"

tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)

In [None]:
import pandas as pd

# Source CSV; a relative path, so it is resolved against the current working directory.
input_file = "HNSC_patho.csv"
# Destination CSV that receives the input rows plus their embedding columns.
# NOTE(review): the leading "/" makes this an absolute path at the filesystem
# root, unlike the relative input path above — confirm this is the intended
# location (writing there often requires elevated permissions).
output_file = "/HNSC_embeddings.csv"
# Load the whole input table into memory.
df = pd.read_csv(input_file)

In [None]:
import numpy as np

# Name of the CSV column that holds the text to embed.
text_column = "text"
if text_column not in df.columns:
    # Fail early with a message that names the missing column, the file it was
    # expected in, and the columns that are actually present.
    raise ValueError(
        f"Column '{text_column}' not found in {input_file!r}; "
        f"available columns: {list(df.columns)}"
    )

def get_embedding_for_text(text, max_length=512):
    """Return the model's pooled output for one piece of text as a NumPy array.

    The text is encoded with the notebook-level ``tokenizer`` (truncated to at
    most ``max_length`` encoded tokens) and passed through the notebook-level
    ``model`` with gradient tracking disabled.

    Args:
        text: The string to embed.
        max_length: Maximum encoded sequence length handed to the tokenizer;
            longer inputs are truncated. Defaults to 512, the value that was
            previously hard-coded in the body.

    Returns:
        ``outputs.pooler_output`` with size-1 dimensions squeezed out,
        converted to a NumPy array.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Put the encoded tensors on the same device as the model so the forward
    # pass also works when the model has been moved to a GPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    # .cpu() is a no-op for CPU tensors and is required before .numpy() for
    # tensors that live on an accelerator.
    return outputs.pooler_output.squeeze().cpu().numpy()

# Maximum encoded length used by get_embedding_for_text. The tokenizer spends
# part of that budget on special tokens when it encodes a single sequence, so
# each chunk of content tokens must leave room for them; otherwise the tail of
# every chunk would be silently truncated away.
max_model_tokens = 512
chunk_size = max_model_tokens - tokenizer.num_special_tokens_to_add(pair=False)

# Width of one embedding vector, taken from the model instead of a magic number.
embedding_dim = model.config.hidden_size

# One entry per row of df, in row order. The embeddings DataFrame itself is
# assembled from this list in the export cell that follows.
embeddings = []
total_rows = len(df)

for row_number, text in enumerate(df[text_column], start=1):
    if pd.isna(text):
        # Missing text: keep a placeholder so rows stay aligned with df.
        sentence_embedding = [None] * embedding_dim
    else:
        # Tokenize once and reuse the result for both the length check and the chunking.
        tokens = tokenizer.tokenize(text)
        if len(tokens) > chunk_size:
            # Split the token sequence into consecutive chunks, embed each
            # chunk separately, then average the chunk embeddings.
            # Re-encoding a chunk's text may yield a slightly different token
            # count than the chunk itself (e.g. when a chunk starts mid-word),
            # so the truncation inside get_embedding_for_text stays as the
            # final safety net.
            chunk_embeddings = [
                get_embedding_for_text(
                    tokenizer.convert_tokens_to_string(tokens[start:start + chunk_size])
                )
                for start in range(0, len(tokens), chunk_size)
            ]
            sentence_embedding = np.mean(chunk_embeddings, axis=0)
        else:
            sentence_embedding = get_embedding_for_text(text)

    embeddings.append(sentence_embedding)

    # Report progress every 10 rows, including rows whose text was missing.
    if row_number % 10 == 0:
        print(f"Processing {row_number}/{total_rows} text")

In [None]:
# One column per embedding dimension: embedding_0 ... embedding_767.
embedding_columns = [f"embedding_{dim}" for dim in range(768)]

# Turn the per-row embedding list into a table and place it to the right of
# the original columns.
embedding_df = pd.DataFrame(data=embeddings, columns=embedding_columns)
result_df = pd.concat(objs=[df, embedding_df], axis="columns")

# Persist without the positional index, then confirm where the file went.
result_df.to_csv(output_file, index=False)
print(f"Saved {output_file}")