In [1]:
from datasets import Dataset, DatasetDict, load_dataset

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.training_args import MultiDatasetBatchSamplers
from sentence_transformers.losses import SoftmaxLoss, MultipleNegativesRankingLoss, TripletLoss
from sentence_transformers.evaluation import TripletEvaluator, BinaryClassificationEvaluator, SequentialEvaluator
from transformers import EarlyStoppingCallback
from sentence_transformers import evaluation
import pandas as pd
import numpy as np
import random

2025-05-30 13:32:19.340794: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748611939.553325      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748611939.617612      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
df_train = pd.read_csv("/kaggle/input/data-uit/train.csv")
df_val = pd.read_csv("/kaggle/input/data-uit/val.csv")
df_test = pd.read_csv("/kaggle/input/data-uit/test.csv")

In [3]:
columns_to_keep = ['index','question', 'context']
df_train = df_train[columns_to_keep]
df_val = df_val[columns_to_keep]
df_test = df_test[columns_to_keep]

In [4]:
df_train = df_train.rename(columns={'question': 'query'})
df_val = df_val.rename(columns={'question': 'query'})
df_test = df_test.rename(columns={'question': 'query'})

In [5]:
df_train['label'] = 1
df_val['label'] = 1
df_test['label'] = 1

In [6]:
def create_neg(df):
    neg = df.copy()
    neg["context"] = neg["context"].sample(frac=1, random_state=42).reset_index(drop=True)
    neg["label"] = 0
    return neg
neg_train = create_neg(df_train)
neg_val = create_neg(df_val)
neg_test = create_neg(df_test)

In [7]:
def make_pair(df,neg):
    pair_df = pd.concat([df, neg]).sample(frac=1, random_state=42).reset_index(drop=True)
    return pair_df
pair_train = make_pair(df_train,neg_train) 
pair_val = make_pair(df_val,neg_val)
pair_test = make_pair(df_test,neg_test)

In [8]:
def gen_triplets(df, n):
    trip = []
    queries = df["query"].unique().tolist()
    for _ in range(n):
        q = random.choice(queries)
        pos_pool = df[(df["query"]==q)&(df["label"]==1)]["context"].tolist()
        neg_pool = df[(df["query"]==q)&(df["label"]==0)]["context"].tolist()
        if not pos_pool or not neg_pool: continue
        trip.append({
            "anchor": q,
            "positive": random.choice(pos_pool),
            "negative": random.choice(neg_pool)
        })
    return pd.DataFrame(trip)

trip_df_train = gen_triplets(pair_train, n = 5000)
trip_df_val = gen_triplets(pair_val,n = 1500)
trip_df_test = gen_triplets(pair_test,n = 1500)

In [9]:
p_train = pair_train
p_valid = pair_val
p_test = pair_test

t_train = trip_df_train
t_valid = trip_df_val
t_test = trip_df_test

In [10]:
def to_pair(ex):
    return {"question": ex["query"],"context": ex["context"], "label": ex["label"]}

def to_triplet(ex):
    return {"anchor": ex["anchor"],"positive": ex["positive"], "negative": ex["negative"]}

In [11]:
pair_ds = DatasetDict({
    "train": Dataset.from_pandas(p_train.reset_index(drop=True)).map(to_pair, remove_columns=list(p_train.columns)),
    "validation": Dataset.from_pandas(p_valid.reset_index(drop=True)).map(to_pair, remove_columns=list(p_valid.columns)),
    "test": Dataset.from_pandas(p_test.reset_index(drop=True)).map(to_pair, remove_columns=list(p_test.columns)),
})

trip_ds = DatasetDict({
    "train": Dataset.from_pandas(t_train.reset_index(drop=True)).map(to_triplet, remove_columns=list(t_train.columns)),
    "validation": Dataset.from_pandas(t_valid.reset_index(drop=True)).map(to_triplet, remove_columns=list(t_valid.columns)),
    "test": Dataset.from_pandas(t_test.reset_index(drop=True)).map(to_triplet, remove_columns=list(t_test.columns)),
})

Map:   0%|          | 0/15612 [00:00<?, ? examples/s]

Map:   0%|          | 0/1952 [00:00<?, ? examples/s]

Map:   0%|          | 0/1952 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [12]:
train_ds = {"pair-class": pair_ds["train"], "triplet": trip_ds["train"]}
valid_ds = {"pair-class": pair_ds["validation"], "triplet": trip_ds["validation"]}
test_ds = {"pair-class": pair_ds["test"], "triplet": trip_ds["test"]}

In [13]:
from sentence_transformers import models, SentenceTransformer

# 7. Tạo mô hình và losses
#transformer = models.Transformer("hiieu/halong_embedding", max_seq_length=512)
transformer = models.Transformer("VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")
pooling    = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="mean")
model      = SentenceTransformer(modules=[transformer, pooling])

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [14]:
#Ham loss
pair_loss    = SoftmaxLoss(model, model.get_sentence_embedding_dimension(), num_labels=2)
triplet_loss = TripletLoss(model)

In [15]:
pair_eval = BinaryClassificationEvaluator(
    sentences1=[x[0] for x in pair_ds["validation"]["question"]],
    sentences2=[x[1] for x in pair_ds["validation"]["context"]],
    labels=pair_ds["validation"]["label"]
)

triplet_eval = TripletEvaluator(
    anchors=[x[0] for x in trip_ds["validation"]["anchor"]],
    positives=[x[1] for x in trip_ds["validation"]["positive"]],
    negatives=[x[2] for x in trip_ds["validation"]["negative"]]
)

dev_evaluator = SequentialEvaluator([triplet_eval, pair_eval], main_score_function=lambda scores: np.average(scores))

In [16]:
dev_evaluator(model)

{'cosine_accuracy': 0.5102459016393442,
 'cosine_accuracy_threshold': 0.6714676022529602,
 'cosine_f1': 0.6671246994160082,
 'cosine_f1_threshold': 0.5638657808303833,
 'cosine_precision': 0.5018087855297158,
 'cosine_recall': 0.9948770491803278,
 'cosine_ap': 0.5018862116043976,
 'cosine_mcc': 0.0,
 'sequential_score': 0.6569430941196884}

In [17]:
training_args = SentenceTransformerTrainingArguments(
    output_dir="/kaggle/working/",
    # Optional training parameters:
    num_train_epochs=10,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True, 
    multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL,
    
    # Optional tracking/debugging parameters:
    eval_strategy="epoch",
    # eval_steps=100,
    save_strategy="epoch",
    # save_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_sequential_score",  
    greater_is_better=True,
    logging_dir="logs",
    logging_strategy="epoch",
    # logging_steps=100,
    report_to="none" 
)


In [18]:
import torch

# Giải phóng bộ nhớ GPU
torch.cuda.empty_cache()

trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    loss={"pair-class": pair_loss, "triplet": triplet_loss},
    evaluator=dev_evaluator,
)

Computing widget examples:   0%|          | 0/2 [00:00<?, ?example/s]

In [19]:
# Sau đó bắt đầu huấn luyện
trainer.train()

Epoch,Training Loss,Validation Loss,Pair-class Loss,Cosine Accuracy,Cosine Accuracy Threshold,Cosine F1,Cosine F1 Threshold,Cosine Precision,Cosine Recall,Cosine Ap,Cosine Mcc,Sequential Score,Triplet Loss
1,0.767,No log,0.24514,0.507684,0.795062,0.667354,0.582142,0.502068,0.994877,0.500434,0.00857,0.73055,0.762187
2,0.2615,No log,0.201794,0.509734,0.829372,0.66804,0.615403,0.502323,0.996926,0.501251,0.022657,0.665959,0.503137
3,0.1463,No log,0.227539,0.508709,0.845958,0.667354,0.629425,0.502068,0.994877,0.501878,0.00857,0.669272,0.423439
4,0.0936,No log,0.255463,0.511783,0.803643,0.66804,0.546577,0.502323,0.996926,0.503565,0.022657,0.726116,0.408891
5,0.0658,No log,0.278904,0.509221,0.823897,0.66804,0.624815,0.502323,0.996926,0.501425,0.022657,0.730379,0.391772
6,0.0538,No log,0.274232,0.513832,0.849971,0.66804,0.640619,0.502323,0.996926,0.504413,0.022657,0.725206,0.359453
7,0.041,No log,0.293501,0.512807,0.821438,0.66804,0.579749,0.502323,0.996926,0.504448,0.022657,0.721224,0.338907
8,0.0327,No log,0.30852,0.510758,0.834476,0.66804,0.607599,0.502323,0.996926,0.503599,0.022657,0.721133,0.351376
9,0.0308,No log,0.31414,0.512807,0.838708,0.66804,0.632259,0.502323,0.996926,0.50325,0.022657,0.720625,0.35721
10,0.0268,No log,0.316328,0.514857,0.845438,0.66804,0.649025,0.502323,0.996926,0.503805,0.022657,0.720902,0.349036


TrainOutput(global_step=8600, training_loss=0.15192189504933912, metrics={'train_runtime': 10330.7672, 'train_samples_per_second': 19.952, 'train_steps_per_second': 0.832, 'total_flos': 0.0, 'train_loss': 0.15192189504933912, 'epoch': 10.0})

In [20]:
pair_class_test_evaluator = evaluation.BinaryClassificationEvaluator(
    sentences1=pair_ds["test"]["question"],
    sentences2=pair_ds["test"]["context"],
    labels=pair_ds["test"]["label"]
)

triplet_test_evaluator = evaluation.TripletEvaluator(
    anchors=trip_ds["test"]["anchor"],
    positives=trip_ds["test"]["positive"],
    negatives=trip_ds["test"]["negative"]
)

In [21]:
# Kết hợp các bộ đánh giá
test_evaluator = SequentialEvaluator([pair_class_test_evaluator, triplet_test_evaluator], main_score_function=lambda scores: np.mean(scores))

# Thực hiện đánh giá
test_evaluator(model)

{'cosine_accuracy': 0.9613333344459534,
 'cosine_accuracy_threshold': 0.5608683824539185,
 'cosine_f1': 0.9104704097116844,
 'cosine_f1_threshold': 0.5594302415847778,
 'cosine_precision': 0.8991008991008991,
 'cosine_recall': 0.9221311475409836,
 'cosine_ap': 0.9590718826021234,
 'cosine_mcc': 0.8189162369350828,
 'sequential_score': 0.9602026085240384}

In [22]:
model.save("/kaggle/working/sup_SimCSE_VietNamese_phobert_base")