In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch.optim as optim



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the pair data, keeping only the two descriptions and their score.
df = pd.read_csv(
    '../data/data_similarity.csv',
    usecols=["Describe_1", "Describe_2", "Similarity_score"],
)

In [None]:
# Display the loaded frame as the cell's rich output.
df

In [None]:
# Number of rows that exactly repeat an earlier row (all three columns equal).
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows, then confirm none remain and report the new size.
df = df.drop_duplicates()
print(
    df.duplicated().sum()
)
print(
    df.shape
)

In [None]:
# Positive pairs: similarity score of at least 0.7.
movie_positive = df.loc[df['Similarity_score'].ge(0.7)]

# 80 % train, then the remaining 20 % is halved into validation and test.
train_df_positive, temp_df_positive = train_test_split(
    movie_positive,
    test_size=0.2,
    random_state=42,
)
valid_df_positive, test_df_positive = train_test_split(
    temp_df_positive,
    test_size=0.5,
    random_state=42,
)


In [None]:
# Negative pairs: similarity score of at most 0.3.
movie_negative = df[df['Similarity_score'] <= 0.3]

# Down-sample to at most 1,000,000 rows. Capping n at the bucket size keeps the
# cell runnable on smaller datasets: DataFrame.sample raises ValueError when n
# exceeds the population (replace=False). With >= 1,000,000 rows available the
# sample is identical to the uncapped call.
movie_negative = movie_negative.sample(
    n=min(1000000, len(movie_negative)),
    random_state=42,
)

# 80 % train, then the remaining 20 % is halved into validation and test.
train_df_negative, temp_df_negative = train_test_split(movie_negative, test_size=0.2, random_state=42)
valid_df_negative, test_df_negative = train_test_split(temp_df_negative, test_size=0.5, random_state=42)

In [None]:
# Hard negatives: similarity score strictly between 0.3 and 0.7.
movie_hard_negative = df[(df['Similarity_score'] > 0.3) & (df['Similarity_score'] < 0.7)]

# Down-sample to at most 1,000,000 rows. Capping n at the bucket size keeps the
# cell runnable on smaller datasets: DataFrame.sample raises ValueError when n
# exceeds the population (replace=False). With >= 1,000,000 rows available the
# sample is identical to the uncapped call.
movie_hard_negative = movie_hard_negative.sample(
    n=min(1000000, len(movie_hard_negative)),
    random_state=42,
)

# 80 % train, then the remaining 20 % is halved into validation and test.
train_df_hard_negative, temp_df_hard_negative = train_test_split(movie_hard_negative, test_size=0.2, random_state=42)
valid_df_hard_negative, test_df_hard_negative = train_test_split(temp_df_hard_negative, test_size=0.5, random_state=42)

In [None]:
# Merge the three buckets of each split and shuffle the rows with a fixed seed
# so positives, hard negatives and negatives are interleaved.
train_df = (
    pd.concat([train_df_positive, train_df_hard_negative, train_df_negative])
    .sample(frac=1, random_state=42)
)
valid_df = (
    pd.concat([valid_df_positive, valid_df_hard_negative, valid_df_negative])
    .sample(frac=1, random_state=42)
)
test_df = (
    pd.concat([test_df_positive, test_df_hard_negative, test_df_negative])
    .sample(frac=1, random_state=42)
)

# Report the size of each split.
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)


In [None]:
# Persist each split as parquet (fastparquet engine, index not written).
train_df.to_parquet(
    path="../data/data_train.parquet",
    engine="fastparquet",
    index=False,
)
valid_df.to_parquet(
    path="../data/data_valid.parquet",
    engine="fastparquet",
    index=False,
)
test_df.to_parquet(
    path="../data/data_test.parquet",
    engine="fastparquet",
    index=False,
)

In [3]:
# Reload the persisted splits so the notebook can resume from here without
# re-running the CSV preparation cells.
train_df = pd.read_parquet(path="../data/data_train.parquet")
valid_df = pd.read_parquet(path="../data/data_valid.parquet")
test_df = pd.read_parquet(path="../data/data_test.parquet")

In [4]:
# Display the reloaded training split as the cell's rich output.
train_df

Unnamed: 0,Describe_1,Describe_2,Similarity_score
0,Backstreet Rookie 2020u200f Choi DaeHyun ( Ji ...,Simple Man 2016 Khế Ước Của Quỷxa0Simple Man (...,0.39
1,Shin Masked Rider 2023 một bộ phim siêu anh hù...,Star Trek 9 Insurrection 1998 Trong chuyến hàn...,0.35
2,My Name Is Vendetta 2022 Sofia là một thiếu ni...,Eiga Sumairu Purikyua Ehon no naka wa minna ch...,0.41
3,Power Ranger Mighty Morphin The Movie 1995 Bộ ...,Dragon Ball Z Movie 11 Super Senshi Gekiha Kat...,0.43
4,Robot Sori Sori Voice From The Heart 2016 Cốt...,New Smiling Proud Wanderer 2018 Tân Tiếu Ngạo ...,0.24
...,...,...,...
1600620,The Sniper 2020 Đội An ninh Tiger của Công ty ...,Bond 21 Casino Royale 2006 Đặc vụ MI6 James B...,0.40
1600621,Doraemon the 40th Movie Nobitas New Dinosaur ...,Go go go 2013 Tỷ Tỷ Sông Phaxa0Go go go 2013 P...,0.21
1600622,Sòng Bạc 2022 Bộ phim xoay quanh một nhân vật ...,Tale of the Nine Tailed 2020u200f Truyện Cửu V...,0.44
1600623,Rich in Love 2 (Ricos de Amor 2) 2023 là một b...,Miranda Sings Live Your Welcome 2019 Ngôi sao ...,0.33


In [None]:
class PhoBertDataset(Dataset):
    """Map-style dataset over description pairs and their similarity score.

    Each item tokenizes ``Describe_1`` and ``Describe_2`` together as ONE
    sentence-pair sequence, padded and truncated to ``max_length`` tokens.

    NOTE(review): the loss and evaluation code in this notebook split the batch
    of CLS embeddings into two halves and compare them, which expects the two
    descriptions to be encoded as separate sequences. Confirm that the joint
    encoding produced here is what the consumer of this dataset needs.

    Args:
        df: DataFrame with ``Describe_1``, ``Describe_2`` and
            ``Similarity_score`` columns.
        tokenizer: Callable tokenizer accepting a sentence pair plus
            ``padding`` / ``truncation`` / ``max_length`` / ``return_tensors``.
        max_length: Fixed token length of every encoded item.
    """

    def __init__(self, df, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.sentences1 = df['Describe_1'].tolist()
        self.sentences2 = df['Describe_2'].tolist()
        self.similarity = df['Similarity_score'].tolist()
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.similarity)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for pair ``idx``."""
        encoded = self.tokenizer(
            self.sentences1[idx], self.sentences2[idx],  # the two sentences are joined into one sequence
            padding="max_length",
            # Without truncation a pair longer than max_length yields a longer
            # tensor than the padded ones, so items can no longer be stacked
            # into a batch and may exceed the model's maximum input length.
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            # return_tensors="pt" adds a leading dim of 1; drop it -> (seq_len,)
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(self.similarity[idx], dtype=torch.float32)
        }


In [None]:
class WeightedSimilarityMSELoss(nn.Module):
    """Weighted MSE between CLS-embedding cosine similarity and target scores.

    The batch passed to ``forward`` must hold ``2 * B`` sequences for ``B``
    labels: the first ``B`` rows are the first sentence of each pair and the
    last ``B`` rows the second sentence (the rows are split in half along
    dim 0 and compared row by row).

    Pairs with a label of at least 0.7 get an epoch-dependent weight that
    decays linearly from ``max_weight``; every other pair gets weight 1.5.

    Args:
        num_epochs: Number of epochs over which the positive weight decays.
        max_weight: Positive-pair weight at epoch 0.
        min_weight: Positive-pair weight at ``epoch == num_epochs``.

    Raises:
        ValueError: If ``num_epochs`` is not positive.
    """

    def __init__(self, num_epochs=5, max_weight=50, min_weight=3):
        super().__init__()
        if num_epochs <= 0:
            # compute_weight divides by num_epochs
            raise ValueError(f"num_epochs must be positive, got {num_epochs}")
        self.loss_fct = nn.MSELoss(reduction='none')
        self.num_epochs = num_epochs
        self.max_weight = max_weight
        self.min_weight = min_weight

    def compute_weight(self, epoch):
        """Return the positive-pair weight for ``epoch`` (linear decay).

        The decay fraction is clamped to [0, 1] so the weight always stays
        between ``min_weight`` and ``max_weight``; unclamped, any
        ``epoch > num_epochs`` would fall below ``min_weight`` and could turn
        negative. Note that with 0-based epochs ``0 .. num_epochs - 1`` the
        last epoch still sits one step above ``min_weight``.
        """
        progress = min(max(epoch / self.num_epochs, 0.0), 1.0)
        return self.max_weight - progress * (self.max_weight - self.min_weight)

    def forward(self, model, input_ids, attention_mask, labels, epoch):
        """Run ``model`` on the batch and return the mean weighted squared error.

        Args:
            model: Callable returning an object with ``last_hidden_state``.
            input_ids: Token ids for ``2 * B`` sequences (see class docstring).
            attention_mask: Attention mask matching ``input_ids``.
            labels: 1-D tensor of ``B`` target similarity scores.
            epoch: Current epoch, used for the positive-pair weight.

        Raises:
            ValueError: If the number of encoded rows is not exactly twice the
                number of labels, or ``labels`` is not 1-D.
        """
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state[:, 0, :]  # embedding at position 0 (CLS)

        # Fail loudly on a wrong batch layout: otherwise some sizes broadcast
        # silently (e.g. 2 rows vs 2 labels) and others die with an opaque
        # shape error inside the loss.
        if labels.dim() != 1 or embeddings.size(0) != 2 * labels.size(0):
            raise ValueError(
                "expected 2*B encoded rows ([first sentences; second sentences]) "
                f"for B labels, got {embeddings.size(0)} rows and labels of shape "
                f"{tuple(labels.shape)}"
            )

        embeddings1, embeddings2 = embeddings.chunk(2, dim=0)
        cos_sim = nn.functional.cosine_similarity(embeddings1, embeddings2, dim=-1)

        loss = self.loss_fct(cos_sim, labels)

        # Per-sample weights by label bucket.
        weights = torch.ones_like(labels)
        current_weight = self.compute_weight(epoch)
        weights[labels <= 0.3] = 1.5                       # negatives
        weights[(labels > 0.3) & (labels < 0.7)] = 1.5     # hard negatives
        weights[labels >= 0.7] = current_weight            # positives

        return (loss * weights).mean()


In [None]:
def evaluate_model(model, val_dataloader, device):
    """Compute sample-averaged MSE and MAE of cosine similarity vs. labels.

    Every batch must hold ``2 * B`` encoded sequences for ``B`` labels: the
    first ``B`` rows are the first sentence of each pair and the last ``B``
    rows the second sentence (rows are split in half along dim 0).

    Args:
        model: Model exposing ``eval()`` and returning ``last_hidden_state``.
        val_dataloader: Iterable of dicts with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, avg_mae)`` averaged over all evaluated pairs.

    Raises:
        ValueError: If a batch does not have the 2*B-rows-per-B-labels layout,
            or the dataloader yields no samples.
    """
    model.eval()
    # Three separate accumulators (unpacking a single 0 raises TypeError).
    total_sq_error, total_abs_error, num_samples = 0.0, 0.0, 0

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state[:, 0, :]  # embedding at position 0 (CLS)

            # Reject a wrong batch layout instead of broadcasting silently or
            # failing with an opaque shape error.
            if labels.dim() != 1 or embeddings.size(0) != 2 * labels.size(0):
                raise ValueError(
                    "expected 2*B encoded rows ([first sentences; second sentences]) "
                    f"for B labels, got {embeddings.size(0)} rows and labels of shape "
                    f"{tuple(labels.shape)}"
                )

            embeddings1, embeddings2 = embeddings.chunk(2, dim=0)
            cos_sim = nn.functional.cosine_similarity(embeddings1, embeddings2, dim=-1)

            # Accumulate sums so the final averages are per sample even when
            # the last batch is smaller.
            diff = cos_sim - labels
            total_sq_error += (diff ** 2).sum().item()
            total_abs_error += diff.abs().sum().item()
            num_samples += labels.size(0)

    if num_samples == 0:
        raise ValueError("val_dataloader yielded no samples to evaluate")

    avg_loss = total_sq_error / num_samples
    avg_mae = total_abs_error / num_samples

    print(f"Validation Loss: {avg_loss:.2f} | Validation MAE: {avg_mae:.2f}")
    return avg_loss, avg_mae


In [None]:
# Training configuration (single source of truth for the epoch count, which is
# used both by the loop and by the loss's weight schedule).
NUM_EPOCHS = 5
BATCH_SIZE = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_amp = device.type == "cuda"  # mixed precision is only enabled on a GPU
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')

train_data = PhoBertDataset(train_df, tokenizer)
val_data = PhoBertDataset(valid_df, tokenizer)


def make_pair_collate(pairs):
    """Build a collate_fn that encodes both descriptions of each pair separately.

    The loss and evaluation code split the batch of CLS embeddings in half and
    compare the halves, so a batch of B pairs must contain 2*B sequences laid
    out as [all Describe_1; all Describe_2]. PhoBertDataset.__getitem__ encodes
    each pair jointly as one sequence (one CLS vector per pair), which cannot
    be split that way, so batches are built here from the dataset's raw
    sentence lists instead.
    """
    def collate(indices):
        first = [pairs.sentences1[i] for i in indices]
        second = [pairs.sentences2[i] for i in indices]
        encoded = pairs.tokenizer(
            first + second,               # rows 0..B-1: sentence 1, rows B..2B-1: sentence 2
            padding=True,                 # pad to the longest sequence in this batch
            truncation=True,
            max_length=pairs.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            "label": torch.tensor([pairs.similarity[i] for i in indices], dtype=torch.float32),
        }
    return collate


# The loaders iterate over row indices; the collate function turns each batch
# of indices into the 2*B-row layout described above.
train_dataloader = DataLoader(
    range(len(train_data)),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=make_pair_collate(train_data),
)
val_dataloader = DataLoader(
    range(len(val_data)),
    batch_size=BATCH_SIZE,
    collate_fn=make_pair_collate(val_data),
)

model = AutoModel.from_pretrained('vinai/phobert-base').to(device)
loss_pb = WeightedSimilarityMSELoss(num_epochs=NUM_EPOCHS).to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
# Gradient scaling / autocast become no-ops when CUDA is unavailable.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

    for batch in epoch_iterator:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            loss = loss_pb(model, input_ids, attention_mask, labels, epoch)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_iterator.set_postfix(loss=loss.item())

    # Evaluate on the validation split after every epoch
    val_loss, val_mae = evaluate_model(model, val_dataloader, device)


In [None]:
# Write the fine-tuned weights and the tokenizer files to the same directory
# so the pair can be reloaded together.
model.save_pretrained(
    save_directory="fine_tuned_phobert",
)
tokenizer.save_pretrained(
    save_directory="fine_tuned_phobert",
)
