In [1]:
from datasets import Dataset, DatasetDict, load_dataset

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.training_args import MultiDatasetBatchSamplers
from sentence_transformers.losses import SoftmaxLoss, MultipleNegativesRankingLoss, TripletLoss
from sentence_transformers.evaluation import TripletEvaluator, BinaryClassificationEvaluator, SequentialEvaluator
from transformers import EarlyStoppingCallback
from sentence_transformers import evaluation
import pandas as pd
import numpy as np
import random

2025-05-23 03:27:52.407745: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747970872.607077      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747970872.667800      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
df_train = pd.read_csv("/kaggle/input/data-uit/train.csv")
df_val = pd.read_csv("/kaggle/input/data-uit/val.csv")
df_test = pd.read_csv("/kaggle/input/data-uit/test.csv")

In [3]:
columns_to_keep = ['index','question', 'context']
df_train = df_train[columns_to_keep]
df_val = df_val[columns_to_keep]
df_test = df_test[columns_to_keep]

In [4]:
df_train = df_train.rename(columns={'question': 'query'})
df_val = df_val.rename(columns={'question': 'query'})
df_test = df_test.rename(columns={'question': 'query'})

In [5]:
df_train['label'] = 1
df_val['label'] = 1
df_test['label'] = 1

In [6]:
def create_neg(df):
    neg = df.copy()
    neg["context"] = neg["context"].sample(frac=1, random_state=42).reset_index(drop=True)
    neg["label"] = 0
    return neg
neg_train = create_neg(df_train)
neg_val = create_neg(df_val)
neg_test = create_neg(df_test)

In [7]:
def make_pair(df,neg):
    pair_df = pd.concat([df, neg]).sample(frac=1, random_state=42).reset_index(drop=True)
    return pair_df
pair_train = make_pair(df_train,neg_train) 
pair_val = make_pair(df_val,neg_val)
pair_test = make_pair(df_test,neg_test)

In [8]:
def gen_triplets(df, n):
    trip = []
    queries = df["query"].unique().tolist()
    for _ in range(n):
        q = random.choice(queries)
        pos_pool = df[(df["query"]==q)&(df["label"]==1)]["context"].tolist()
        neg_pool = df[(df["query"]==q)&(df["label"]==0)]["context"].tolist()
        if not pos_pool or not neg_pool: continue
        trip.append({
            "anchor": q,
            "positive": random.choice(pos_pool),
            "negative": random.choice(neg_pool)
        })
    return pd.DataFrame(trip)

trip_df_train = gen_triplets(pair_train, n = 5000)
trip_df_val = gen_triplets(pair_val,n = 1500)
trip_df_test = gen_triplets(pair_test,n = 1500)

In [9]:
p_train = pair_train
p_valid = pair_val
p_test = pair_test

t_train = trip_df_train
t_valid = trip_df_val
t_test = trip_df_test

In [10]:
def to_pair(ex):
    return {"question": ex["query"],"context": ex["context"], "label": ex["label"]}

def to_triplet(ex):
    return {"anchor": ex["anchor"],"positive": ex["positive"], "negative": ex["negative"]}

In [11]:
pair_ds = DatasetDict({
    "train": Dataset.from_pandas(p_train.reset_index(drop=True)).map(to_pair, remove_columns=list(p_train.columns)),
    "validation": Dataset.from_pandas(p_valid.reset_index(drop=True)).map(to_pair, remove_columns=list(p_valid.columns)),
    "test": Dataset.from_pandas(p_test.reset_index(drop=True)).map(to_pair, remove_columns=list(p_test.columns)),
})

trip_ds = DatasetDict({
    "train": Dataset.from_pandas(t_train.reset_index(drop=True)).map(to_triplet, remove_columns=list(t_train.columns)),
    "validation": Dataset.from_pandas(t_valid.reset_index(drop=True)).map(to_triplet, remove_columns=list(t_valid.columns)),
    "test": Dataset.from_pandas(t_test.reset_index(drop=True)).map(to_triplet, remove_columns=list(t_test.columns)),
})

Map:   0%|          | 0/15612 [00:00<?, ? examples/s]

Map:   0%|          | 0/1952 [00:00<?, ? examples/s]

Map:   0%|          | 0/1952 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [12]:
train_ds = {"pair-class": pair_ds["train"], "triplet": trip_ds["train"]}
valid_ds = {"pair-class": pair_ds["validation"], "triplet": trip_ds["validation"]}
test_ds = {"pair-class": pair_ds["test"], "triplet": trip_ds["test"]}

In [13]:
from sentence_transformers import models, SentenceTransformer

# 7. Tạo mô hình và losses
transformer = models.Transformer("hiieu/halong_embedding", max_seq_length=512)
pooling    = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="mean")
model      = SentenceTransformer(modules=[transformer, pooling])

config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [14]:
#Ham loss
pair_loss    = SoftmaxLoss(model, model.get_sentence_embedding_dimension(), num_labels=2)
triplet_loss = TripletLoss(model)

In [15]:
pair_eval = BinaryClassificationEvaluator(
    sentences1=[x[0] for x in pair_ds["validation"]["question"]],
    sentences2=[x[1] for x in pair_ds["validation"]["context"]],
    labels=pair_ds["validation"]["label"]
)

triplet_eval = TripletEvaluator(
    anchors=[x[0] for x in trip_ds["validation"]["anchor"]],
    positives=[x[1] for x in trip_ds["validation"]["positive"]],
    negatives=[x[2] for x in trip_ds["validation"]["negative"]]
)

dev_evaluator = SequentialEvaluator([triplet_eval, pair_eval], main_score_function=lambda scores: np.average(scores))

In [16]:
dev_evaluator(model)

{'cosine_accuracy': 0.5097336065573771,
 'cosine_accuracy_threshold': 0.3784555196762085,
 'cosine_f1': 0.6662111376836352,
 'cosine_f1_threshold': 0.16791117191314697,
 'cosine_precision': 0.49974372116863147,
 'cosine_recall': 0.9989754098360656,
 'cosine_ap': 0.49724094850971895,
 'cosine_mcc': 0.0,
 'sequential_score': 0.6676204740164409}

In [17]:
training_args = SentenceTransformerTrainingArguments(
    output_dir="/kaggle/working/",
    # Optional training parameters:
    num_train_epochs=10,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True, 
    multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL,
    
    # Optional tracking/debugging parameters:
    eval_strategy="epoch",
    # eval_steps=100,
    save_strategy="epoch",
    # save_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_sequential_score",  
    greater_is_better=True,
    logging_dir="logs",
    logging_strategy="epoch",
    # logging_steps=100,
    report_to="none" 
)


In [18]:
import torch

# Giải phóng bộ nhớ GPU
torch.cuda.empty_cache()

trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    loss={"pair-class": pair_loss, "triplet": triplet_loss},
    evaluator=dev_evaluator,
)

Computing widget examples:   0%|          | 0/2 [00:00<?, ?example/s]

In [19]:
# Sau đó bắt đầu huấn luyện
trainer.train()

Epoch,Training Loss,Validation Loss,Pair-class Loss,Cosine Accuracy,Cosine Accuracy Threshold,Cosine F1,Cosine F1 Threshold,Cosine Precision,Cosine Recall,Cosine Ap,Cosine Mcc,Sequential Score,Triplet Loss
1,0.5233,No log,0.152314,0.506148,0.583757,0.667353,0.568939,0.501807,0.995902,0.495703,0.016038,0.668518,0.342888
2,0.1567,No log,0.151229,0.509734,0.622977,0.667122,0.570631,0.500513,1.0,0.497272,0.032026,0.674969,0.253336
3,0.0962,No log,0.237978,0.509734,0.749356,0.66735,0.655094,0.50077,1.0,0.497201,0.032026,0.674934,0.250054
4,0.0669,No log,0.260599,0.507172,0.780297,0.667122,0.740991,0.500513,1.0,0.496201,0.032026,0.669434,0.232314
5,0.0514,No log,0.218919,0.510246,0.874047,0.667122,0.84311,0.500513,1.0,0.497027,0.032026,0.675513,0.277591
6,0.0527,No log,0.194033,0.512807,0.949363,0.667352,0.925783,0.501287,0.997951,0.501312,0.03781,0.676989,0.36343
7,0.0545,No log,0.193595,0.51127,0.962458,0.667352,0.943095,0.501287,0.997951,0.501465,0.03781,0.670732,0.401441
8,0.0474,No log,0.188638,0.51127,0.970189,0.667807,0.949358,0.501284,1.0,0.501629,0.050676,0.624148,0.421208
9,0.0461,No log,0.206819,0.509734,0.973071,0.66735,0.95101,0.50077,1.0,0.500782,0.039233,0.623724,0.41593
10,0.047,No log,0.206189,0.509734,0.975887,0.66735,0.954876,0.50077,1.0,0.50114,0.039233,0.622903,0.429725


TrainOutput(global_step=8600, training_loss=0.11421745921290198, metrics={'train_runtime': 20617.9051, 'train_samples_per_second': 9.997, 'train_steps_per_second': 0.417, 'total_flos': 0.0, 'train_loss': 0.11421745921290198, 'epoch': 10.0})

In [20]:
pair_class_test_evaluator = evaluation.BinaryClassificationEvaluator(
    sentences1=pair_ds["test"]["question"],
    sentences2=pair_ds["test"]["context"],
    labels=pair_ds["test"]["label"]
)

triplet_test_evaluator = evaluation.TripletEvaluator(
    anchors=trip_ds["test"]["anchor"],
    positives=trip_ds["test"]["positive"],
    negatives=trip_ds["test"]["negative"]
)

In [21]:
# Kết hợp các bộ đánh giá
test_evaluator = SequentialEvaluator([pair_class_test_evaluator, triplet_test_evaluator], main_score_function=lambda scores: np.mean(scores))

# Thực hiện đánh giá
test_evaluator(model)

{'cosine_accuracy': 0.9793333411216736,
 'cosine_accuracy_threshold': 0.9052469730377197,
 'cosine_f1': 0.9606942317508933,
 'cosine_f1_threshold': 0.9049763083457947,
 'cosine_precision': 0.9572736520854527,
 'cosine_recall': 0.9641393442622951,
 'cosine_ap': 0.985078284081654,
 'cosine_mcc': 0.9211302489054134,
 'sequential_score': 0.9822058126016637}

In [22]:
model.save("/kaggle/working/model_test")