In [None]:
#安装transformers库
!pip install transformers accelerate scikit-learn numpy pandas openpyxl nltk

In [1]:
import torch

# Report whether PyTorch can see a CUDA device before any GPU work starts.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

True


Load the tokenizer:

In [None]:
import os

data_disk_dir = "autodl-tmp"  # data-disk directory (cloud GPU platform)
model_name = "nllb-200-distilled-600M"  # folder name of the base model
model_dir = os.path.join(data_disk_dir, model_name)  # path the tokenizer and model are loaded from

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer stored in model_dir.
# tgt_lang is left unset here; src_lang / tgt_lang are assigned on the tokenizer right before encoding.
tokenizer = AutoTokenizer.from_pretrained(model_dir, tgt_lang=None)

Prepare your training and validation data:

In [None]:
# Root folder of the Korean-Chinese parallel corpus, and the CSV files of its training / validation parts.
data_dir = "data/027.한국어-중국어_번역_말뭉치_1/01.데이터"
csv_train = os.path.join(data_dir, "1.Training/원천데이터/ko2zh_patent_1_training.csv")
csv_valid = os.path.join(data_dir, "2.Validation/원천데이터/ko2zh_patent_2_validation.csv")

In [None]:
# Maps the CSV column headers (Chinese, Korean) to the language codes used as column names from here on.
rename_dict = {"중국어": "zho_Hans", "한국어": "kor_Hang"}

In [None]:
import pandas as pd

# Columns to keep, in the order they appear in rename_dict.
corpus_columns = list(rename_dict)

# Training data: keep the two language columns and relabel them with the language codes.
df_train = pd.read_csv(csv_train)[corpus_columns].rename(columns=rename_dict)

# Validation data: same column selection and relabelling.
df_valid = pd.read_csv(csv_valid)[corpus_columns].rename(columns=rename_dict)

In [None]:
# Display the training frame (two columns: zho_Hans, kor_Hang).
df_train

In [None]:
# Display the validation frame (two columns: zho_Hans, kor_Hang).
df_valid

Split the data into train/validation/test sets (the max length of the encodings is computed from these splits further below):

In [None]:
from sklearn.model_selection import train_test_split

# Source language (x) and target language (y) column names.
language_x = "zho_Hans"
language_y = "kor_Hang"

# The validation sentences are used exactly as loaded.
x_valid = df_valid[language_x].to_list()
y_valid = df_valid[language_y].to_list()

# Hold out a test set from the training sentences, the same size as the validation set.
x_train, x_test, y_train, y_test = train_test_split(
    df_train[language_x].to_list(),
    df_train[language_y].to_list(),
    test_size=len(x_valid),
    random_state=42,
)

In [None]:
# Save the three splits to CSV files
import pandas as pd

# One DataFrame per split, source column first
train_df = pd.DataFrame({language_x: x_train, language_y: y_train})
valid_df = pd.DataFrame({language_x: x_valid, language_y: y_valid})
test_df = pd.DataFrame({language_x: x_test, language_y: y_test})

# Write each frame without the index column
for split_frame, csv_path in (
    (train_df, 'data/train_data.csv'),
    (valid_df, 'data/valid_data.csv'),
    (test_df, 'data/test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

Find the max length of the encodings, tokenize the text, and convert the encodings into torch Dataset objects:

In [None]:
import numpy as np
from tqdm.notebook import tqdm

# Token count of every sentence (source and target, all three splits).
lengths = []

texts_all = x_train + y_train + x_valid + y_valid + x_test + y_test

# Tokenize in batches rather than one sentence per call; with padding and truncation
# disabled each sentence is still encoded on its own, so the lengths are unchanged.
LENGTH_BATCH_SIZE = 1024
for start in tqdm(range(0, len(texts_all), LENGTH_BATCH_SIZE)):
    batch_texts = texts_all[start:start + LENGTH_BATCH_SIZE]
    batch_ids = tokenizer(batch_texts, truncation=False, padding=False)["input_ids"]
    lengths.extend(len(ids) for ids in batch_ids)

# Use the 95th percentile of the token counts as max_length.
max_length = int(np.percentile(lengths, 95))
print(f"The 95% percentile length in the dataset is: {max_length}")

# Hugging Face tokenizers take their default truncation / padding="max_length" size from
# `model_max_length`; a plain `max_length` attribute is not consulted when the tokenizer is called.
# NOTE(review): this value is presumably persisted by tokenizer.save_pretrained() — confirm that is acceptable.
tokenizer.model_max_length = max_length
tokenizer.max_length = max_length  # kept only for code that may read this attribute directly

In [None]:
# Note: we're now creating separate encodings for the inputs and outputs.
# truncation: truncate the sequence to a shorter length, because sometimes a sequence may be too long for a model to handle
# padding: Padding is a strategy for ensuring tensors are rectangular by adding a special padding token to shorter sentences.
#     True or 'longest': Pad to the longest sequence in the batch (or no padding if only a single sequence if provided).
#     'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided.
#     False or 'do_not_pad' (default): No padding (i.e., can output a batch with sequences of different lengths).
# return_tensors: If set 'pt', will return tensors instead of list of python integers. Acceptable values are PyTorch torch.Tensor objects.
# max_length (int, optional): Controls the maximum length to use by one of the truncation/padding parameters.
tokenizer.src_lang = language_x
tokenizer.tgt_lang = language_y
print("tokenizing - source:{}, target:{}".format(language_x, language_y))

# Pass max_length explicitly: without it, truncation and padding="max_length" fall back to the
# tokenizer's model default instead of the 95th-percentile length computed for this corpus.
# NOTE(review): the labels produced via text_target appear to be padded with the pad token id
# rather than -100, so padded positions presumably count towards the loss — confirm.
encode_kwargs = dict(truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")

print("train encodings...")
train_encodings = tokenizer(x_train, text_target=y_train, **encode_kwargs)
print("valid encodings...")
valid_encodings = tokenizer(x_valid, text_target=y_valid, **encode_kwargs)
print("test encodings...")
test_encodings = tokenizer(x_test, text_target=y_test, **encode_kwargs)

In [None]:
import torch


class TranslationDataset(torch.utils.data.Dataset):
    """Dataset view over a dict-like collection of encoded fields.

    Every field of ``encodings`` is indexed with the sample index, and sample
    ``i`` is the dict made of each field's ``i``-th entry.
    """

    def __init__(self, encodings):
        # Keep a reference to the encodings; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        # The sample count is the number of rows stored under "input_ids".
        return len(self.encodings["input_ids"])

    def __getitem__(self, index):
        # Collect the index-th entry of every field into one sample dict.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[index]
        return sample

In [None]:
# Wrap each split's encodings in a TranslationDataset.
train_dataset, valid_dataset, test_dataset = (
    TranslationDataset(split_encodings)
    for split_encodings in (train_encodings, valid_encodings, test_encodings)
)

Load the pretrained model:

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the base seq2seq model from model_dir; device_map="auto" leaves device placement to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, device_map="auto")

Define your training arguments and train the model:

In [None]:
# Name of the fine-tuned model and the directory used as training output / final save location.
new_model_name = f"zh2ko_based_on_{model_name}"
new_model_dir = os.path.join(data_disk_dir, new_model_name)

In [None]:
from transformers import Trainer, TrainingArguments, IntervalStrategy

# Batch settings, named so the step arithmetic below stays in sync with the arguments.
train_batch_size = 1
grad_accum_steps = 1

# Evaluate, log and save about five times per epoch.
# The interval must be a whole number of steps: TrainingArguments treats floats as a ratio and only
# accepts them in [0, 1), so true division (e.g. 1234.6) would be rejected.
# Assumes a single training device — TODO confirm for multi-GPU runs.
steps_per_epoch = max(1, len(x_train) // (train_batch_size * grad_accum_steps))
log_steps = max(1, steps_per_epoch // 5)

# Author's tuning notes:
# fp16: half-precision arithmetic; more than doubles the speed when enabled, without affecting the loss
# gradient_accumulation_steps: the larger the value, the faster the run and the higher the loss
# gradient_checkpointing: when enabled, about 30% slower but saves roughly 2/3 of GPU memory
# per_device_train_batch_size: the larger the size, the higher the GPU utilisation, speed and loss (almost proportionally)
training_args = TrainingArguments(new_model_dir,
                                  num_train_epochs=10,
                                  per_device_eval_batch_size=1,
                                  per_device_train_batch_size=train_batch_size,
                                  gradient_accumulation_steps=grad_accum_steps,
                                  gradient_checkpointing=False,
                                  fp16=True,
                                  warmup_ratio=0.1,
                                  eval_strategy=IntervalStrategy.STEPS,
                                  eval_steps=log_steps,
                                  logging_strategy=IntervalStrategy.STEPS,
                                  logging_steps=log_steps,
                                  save_strategy=IntervalStrategy.STEPS,
                                  save_steps=log_steps,
                                  save_total_limit=1,
                                  load_best_model_at_end=True
                                  )

In [None]:
from transformers import EarlyStoppingCallback


# The stock Transformers Trainer is suspected of leaking GPU memory during evaluation;
# the workaround below follows this forum thread:
# https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/13
def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to predicted token ids before they are accumulated.

    Keeping only the argmax ids (instead of the full logits) avoids storing
    large tensors that are not needed afterwards.

    Args:
        logits: Either the logits tensor itself, or a tuple/list whose first
            element is the logits tensor (any further elements are ignored).
        labels: The label ids for the same batch; passed through unchanged.

    Returns:
        A ``(pred_ids, labels)`` tuple, where ``pred_ids`` is the argmax of the
        logits over their last dimension.
    """
    # Only unwrap when the model returned several outputs; indexing a bare
    # tensor with [0] would silently drop everything but the first batch row.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels


# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Assemble the Trainer: model, arguments, train/validation datasets, and the logits
# pre-processing hook that keeps only predicted ids during evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run prediction on the test dataset (before fine-tuning).
test_results = trainer.predict(test_dataset)

# Pull the individual fields out of the prediction output.
predictions, labels, metrics = (
    test_results.predictions,
    test_results.label_ids,
    test_results.metrics,
)

In [None]:
# Show the evaluation metrics returned by trainer.predict
metrics

In [None]:
# Decode: `predictions` holds the pair (predicted ids, label ids) returned by preprocess_logits_for_metrics.
# NOTE(review): the predicted ids are an argmax over logits computed with the labels supplied to the
# model, i.e. presumably teacher-forced rather than freely generated — confirm before reading BLEU
# as translation quality.
target_texts_array, references_array = predictions
target_texts = tokenizer.batch_decode(target_texts_array, skip_special_tokens=True)
references = tokenizer.batch_decode(references_array, skip_special_tokens=True)

In [None]:
# Side-by-side table: source sentence, decoded model output, decoded reference.
source_texts = x_test

result_columns = {
    "source_texts": source_texts,
    "target_texts": target_texts,
    "references": references,
}
df_result = pd.DataFrame(result_columns)
df_result

In [None]:
# Write the result table as both Excel and CSV, named after the base model.
file_name = f"results_of_{model_name}"
df_result.to_excel(f"{file_name}.xlsx", index=False)
df_result.to_csv(f"{file_name}.csv", index=False)

In [None]:
from konlpy.tag import Okt
# from konlpy.tag import Mecab # Warning:KoNLPy’s Mecab() class is not supported on Windows machines.
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu is the more robust score: it is computed over the whole corpus, whereas sentence_bleu
# only looks at a single sentence.
# Note: nltk 3.8.1 has a problem with Python 3.12; either use an older Python version or patch
# bleu_score.py as described in the links below.
# https://github.com/nltk/nltk/pull/3207
# https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py

# Build the Korean morphological analyzer once and reuse it for every sentence,
# rather than constructing a new Okt() per sentence.
okt = Okt()

# corpus_bleu expects, per hypothesis, a list of tokenized references (one reference here).
references_okt = [[okt.morphs(reference)] for reference in references]
target_texts_okt = [okt.morphs(target_text) for target_text in target_texts]
score = corpus_bleu(references_okt, target_texts_okt)
score

In [None]:
# Fine-tune the model from scratch (no checkpoint resume).
# Alternatives for resuming an interrupted run are kept here for reference:
# trainer.train(resume_from_checkpoint=True)
# trainer.train(resume_from_checkpoint="autodl-tmp/zh2ko_based_on_nllb-200-distilled-600M/checkpoint-315000")
trainer.train(resume_from_checkpoint=False)

Save your fine-tuned model and tokenizer:

In [None]:
# Write the fine-tuned model and the tokenizer into new_model_dir, then save the trainer state.
trainer.save_model(new_model_dir)
tokenizer.save_pretrained(new_model_dir)
trainer.save_state()

In [None]:
# Run prediction on the test dataset (after fine-tuning).
test_results = trainer.predict(test_dataset)

# Pull the individual fields out of the prediction output.
predictions, labels, metrics = (
    test_results.predictions,
    test_results.label_ids,
    test_results.metrics,
)

In [None]:
# Show the evaluation metrics returned by trainer.predict
metrics

In [None]:
# `predictions` holds the pair (predicted ids, label ids) returned by preprocess_logits_for_metrics.
# NOTE(review): the predicted ids are an argmax over logits computed with the labels supplied to the
# model, i.e. presumably teacher-forced rather than freely generated — confirm before reading BLEU
# as translation quality.
target_texts_array, references_array = predictions

# Decode every row of both id arrays back into text
target_texts = tokenizer.batch_decode(target_texts_array, skip_special_tokens=True)
references = tokenizer.batch_decode(references_array, skip_special_tokens=True)

In [None]:
# Side-by-side table: source sentence, decoded model output, decoded reference.
source_texts = x_test

result_columns = {
    "source_texts": source_texts,
    "target_texts": target_texts,
    "references": references,
}
df_result = pd.DataFrame(result_columns)
df_result

In [None]:
# Write the result table as both Excel and CSV, named after the fine-tuned model.
file_name = f"results_of_{new_model_name}"
df_result.to_excel(f"{file_name}.xlsx", index=False)
df_result.to_csv(f"{file_name}.csv", index=False)

In [None]:
from konlpy.tag import Okt
# from konlpy.tag import Mecab # Warning:KoNLPy’s Mecab() class is not supported on Windows machines.
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu is the more robust score: it is computed over the whole corpus, whereas sentence_bleu
# only looks at a single sentence.
# Note: nltk 3.8.1 has a problem with Python 3.12; either use an older Python version or patch
# bleu_score.py as described in the links below.
# https://github.com/nltk/nltk/pull/3207
# https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py

# Build the Korean morphological analyzer once and reuse it for every sentence,
# rather than constructing a new Okt() per sentence.
okt = Okt()

# corpus_bleu expects, per hypothesis, a list of tokenized references (one reference here).
references_okt = [[okt.morphs(reference)] for reference in references]
target_texts_okt = [okt.morphs(target_text) for target_text in target_texts]
score = corpus_bleu(references_okt, target_texts_okt)
score