In [13]:
from sentence_transformers import SentenceTransformer, losses, SentencesDataset, InputExample
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import csv


In [2]:
# Step 1: 读取原始句子
with open('./dataset/syn.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()


In [3]:
# Step 2: 替换"MCS"为"服务器", "mc服", "服"
synonym_replacements = ['服', 'mc服']

In [4]:
# Step 3: 创建同义句对
pairs = []
for sentence in sentences:
    sentence = sentence.strip()  # 去掉多余的换行符和空格
    for replacement in synonym_replacements:
        # 替换"MCS"并生成同义句对
        orig = sentence.replace("MCS", "服务器")
        modified_sentence = sentence.replace("MCS", replacement)
        pairs.append((orig, modified_sentence))


In [None]:
# Step 4: 保存同义句对
with open('./dataset/synonym_pairs.txt', 'w', encoding='utf-8') as f:
    for pair in pairs:
        f.write(f"{pair[0]}\t{pair[1]}\n")


In [None]:
# Step 5: 加载 SentenceTransformer 模型
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


In [16]:
# Step 6: 准备训练数据
positive_examples = [InputExample(texts=[pair[0], pair[1]], label=1.0) for pair in pairs]
negative_examples = []
with open('./dataset/train_neg.csv', mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # 跳过表头
    for row in reader:
        negative_examples.append(InputExample(texts=[row[0], row[1]], label=float(row[2])))

In [17]:

train_dataset = SentencesDataset(positive_examples + negative_examples, model)


In [18]:
# Step 7: 将训练数据划分为训练集和验证集
train_data, val_data = train_test_split(train_dataset, test_size=0.2)


In [22]:
# Step 8: 使用 SentenceTransformer 进行 finetune
# 这里使用训练数据进行 fine-tuning，可以根据需要调整训练参数
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=4, warmup_steps=100, show_progress_bar=True)


  0%|          | 0/28 [00:00<?, ?it/s]

{'train_runtime': 24.772, 'train_samples_per_second': 16.147, 'train_steps_per_second': 1.13, 'train_loss': 0.30372258595057894, 'epoch': 4.0}


In [24]:

# 保存 fine-tuned 模型
model.save('finetuned_paraphrase-multilingual-MiniLM-L12-v2')