In [None]:
import os

import pandas as pd
import torch
import torch.optim as optim
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator as ESE
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm



In [None]:
# Load only the two description columns and their similarity score.
df = pd.read_csv(
    "../data/data_similarity.csv",
    usecols=["Describe_1", "Describe_2", "Similarity_score"],
)

In [None]:
# Display the loaded frame as the cell output for a quick visual check.
df

In [None]:
# Count rows that are exact duplicates of an earlier row (all three columns equal).
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows, then confirm none remain and report the new size.
df = df.drop_duplicates()
print(df.duplicated().sum())  # sanity check: recount duplicates after dropping them
print(df.shape)

In [None]:
# Positive pairs: similarity score of at least 0.7.
movie_positive = df.loc[df['Similarity_score'].ge(0.7)]

# 80% train, then the remaining 20% is halved into validation and test.
train_df_positive, temp_df_positive = train_test_split(
    movie_positive,
    test_size=0.2,
    random_state=42,
)
valid_df_positive, test_df_positive = train_test_split(
    temp_df_positive,
    test_size=0.5,
    random_state=42,
)


In [None]:
# Negative pairs: similarity score of at most 0.3.
movie_negative = df[df['Similarity_score'] <= 0.3]
# Down-sample to at most 1,000,000 rows. Capping at the number of available rows
# avoids the ValueError that `sample(n=...)` raises when fewer rows match.
movie_negative = movie_negative.sample(
    n=min(1_000_000, len(movie_negative)),
    random_state=42,
)

# 80% train, then the remaining 20% is halved into validation and test.
train_df_negative, temp_df_negative = train_test_split(movie_negative, test_size=0.2, random_state=42)
valid_df_negative, test_df_negative = train_test_split(temp_df_negative, test_size=0.5, random_state=42)

In [None]:
# Hard negatives: similarity score strictly between 0.3 and 0.7.
movie_hard_negative = df[(df['Similarity_score'] > 0.3) & (df['Similarity_score'] < 0.7)]
# Down-sample to at most 1,000,000 rows. Capping at the number of available rows
# avoids the ValueError that `sample(n=...)` raises when fewer rows match.
movie_hard_negative = movie_hard_negative.sample(
    n=min(1_000_000, len(movie_hard_negative)),
    random_state=42,
)

# 80% train, then the remaining 20% is halved into validation and test.
train_df_hard_negative, temp_df_hard_negative = train_test_split(movie_hard_negative, test_size=0.2, random_state=42)
valid_df_hard_negative, test_df_hard_negative = train_test_split(temp_df_hard_negative, test_size=0.5, random_state=42)

In [None]:
# Merge the three score bands per split and shuffle the rows in one step,
# so the bands are interleaved rather than stacked one after another.
train_df = pd.concat(
    [train_df_positive, train_df_hard_negative, train_df_negative]
).sample(frac=1, random_state=42)

valid_df = pd.concat(
    [valid_df_positive, valid_df_hard_negative, valid_df_negative]
).sample(frac=1, random_state=42)

test_df = pd.concat(
    [test_df_positive, test_df_hard_negative, test_df_negative]
).sample(frac=1, random_state=42)

# Report the final size of each split.
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)


In [None]:
# Persist the three splits as parquet files (row index is not written).
train_df.to_parquet("../data/data_train.parquet", engine="fastparquet", index=False)
valid_df.to_parquet("../data/data_valid.parquet", engine="fastparquet", index=False)
test_df.to_parquet("../data/data_test.parquet", engine="fastparquet", index=False)

In [None]:
# Reload the saved splits, replacing the in-memory frames of the same names.
# NOTE(review): no `engine=` is passed here, so pandas picks its default parquet
# engine, which may differ from the fastparquet engine used for writing — confirm.
train_df = pd.read_parquet("../data/data_train.parquet")
valid_df = pd.read_parquet("../data/data_valid.parquet")
test_df = pd.read_parquet("../data/data_test.parquet")

In [None]:
class SBERTDataset(Dataset):
    """Map-style dataset of sentence pairs with a similarity label.

    Reads the `Describe_1`, `Describe_2` and `Similarity_score` columns of the
    given DataFrame and stores each one as a plain Python list.
    """

    def __init__(self, df):
        # Column -> attribute mapping, converted to lists once up front.
        self.sentences1 = df['Describe_1'].tolist()
        self.sentences2 = df['Describe_2'].tolist()
        self.similarity = df['Similarity_score'].tolist()  # labels

    def __len__(self):
        # One item per similarity label.
        return len(self.similarity)

    def __getitem__(self, idx):
        """Return the pair at `idx` as `[first, second]` plus its float32 label tensor."""
        pair = [self.sentences1[idx], self.sentences2[idx]]
        score = torch.tensor(self.similarity[idx], dtype=torch.float32)
        return {"sentence_features": pair, "label": score}

class WeightedSimilarityMSELoss(nn.Module):
    """Weighted MSE between pairwise cosine similarity and a target score.

    Each sample's squared error is multiplied by a weight chosen from the band
    its label falls into, and the weighted errors are averaged over the batch.

    Args:
        model: Object exposing `tokenizer`, `device`, and a call that maps the
            tokenized batch to a dict containing `'sentence_embedding'`.
        low_weight: Weight for labels `<= low_threshold`.
        mid_weight: Weight for labels strictly between the two thresholds.
        high_weight: Weight for labels `>= high_threshold`.
        low_threshold: Upper bound (inclusive) of the low band.
        high_threshold: Lower bound (inclusive) of the high band.
    """

    def __init__(self, model, low_weight=1.5, mid_weight=1.5, high_weight=50.0,
                 low_threshold=0.3, high_threshold=0.7):
        super().__init__()
        # reduction='none' keeps one loss value per sample so it can be weighted.
        self.loss_fct = nn.MSELoss(reduction='none')
        self.model = model
        self.low_weight = low_weight
        self.mid_weight = mid_weight
        self.high_weight = high_weight
        self.low_threshold = low_threshold
        self.high_threshold = high_threshold

    def forward(self, sentence_features, labels):
        """Compute the weighted loss for one batch.

        Args:
            sentence_features: Pair `(first_sentences, second_sentences)` of
                equally long sequences of strings.
            labels: Tensor with one target similarity per sentence pair.

        Returns:
            Scalar tensor: mean of the per-sample weighted squared errors.

        Raises:
            ValueError: If the two sentence sequences differ in length.
        """
        device = self.model.device
        first = list(sentence_features[0])
        second = list(sentence_features[1])
        if len(first) != len(second):
            # Splitting the stacked embeddings in half would silently mis-pair them.
            raise ValueError(
                f"Sentence lists must have equal length, got {len(first)} and {len(second)}"
            )

        # Tokenize both sides in a single call so they share one padded batch.
        # max_length follows the model's max_seq_length when it exposes one, so
        # training inputs are truncated like those seen by `encode()`; None
        # falls back to the tokenizer's own limit.
        tokenized = self.model.tokenizer(
            first + second,
            padding=True,
            truncation=True,
            max_length=getattr(self.model, "max_seq_length", None),
            return_tensors="pt",
        ).to(device)

        # One forward pass yields the embeddings of both sentence lists stacked.
        embeddings = self.model(tokenized)['sentence_embedding']

        # First half belongs to `first`, second half to `second`.
        embeddings1, embeddings2 = embeddings.chunk(2, dim=0)

        cos_sim = nn.functional.cosine_similarity(embeddings1, embeddings2, dim=-1)

        # Move the labels once; masks and weights below then live on the same device.
        labels = labels.to(device=device, dtype=cos_sim.dtype)
        loss = self.loss_fct(cos_sim, labels)

        # Per-sample weights by label band (default 1 for anything unmatched, e.g. NaN).
        weights = torch.ones_like(labels)
        weights[labels <= self.low_threshold] = self.low_weight
        weights[(labels > self.low_threshold) & (labels < self.high_threshold)] = self.mid_weight
        weights[labels >= self.high_threshold] = self.high_weight

        return (loss * weights).mean()



In [None]:
train_data = SBERTDataset(train_df)
val_data = SBERTDataset(valid_df)

# Reshuffle the training pairs every epoch; a seeded generator keeps the batch
# order reproducible, matching the random_state=42 used for the data splits.
train_dataloader = DataLoader(
    train_data,
    batch_size=64,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

# Build the validation examples straight from the dataset's stored lists.
# Indexing `val_data[i]` would allocate a label tensor on every access.
val_examples = [
    InputExample(texts=[first, second], label=float(score))
    for first, second, score in zip(val_data.sentences1, val_data.sentences2, val_data.similarity)
]
val_evaluator = ESE.from_input_examples(val_examples, name="val")

model = SentenceTransformer("all-MiniLM-L6-v2")
train_loss = WeightedSimilarityMSELoss(model)

optimizer = optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)


In [None]:
# Directory that receives the evaluator's CSV report and the fine-tuned model.
output_dir = "../model/fine_tuned_sbert_movies"
# Create it up front: the evaluator writes a CSV inside this directory and may
# not create the directory itself, which would fail only after a full epoch.
os.makedirs(output_dir, exist_ok=True)

for epoch in range(5):
    model.train()
    epoch_iterator = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")

    for batch in epoch_iterator:
        batch_labels = batch["label"]
        # The collated batch holds two sequences of strings: first and second sentences.
        sentences1, sentences2 = (list(side) for side in batch["sentence_features"])

        # Compute the loss
        loss = train_loss((sentences1, sentences2), batch_labels)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update the progress bar
        epoch_iterator.set_postfix(loss=loss.item())

    # Evaluate after every epoch; passing `epoch` labels the row in the CSV report.
    model.eval()
    val_score = val_evaluator(model, output_path=output_dir, epoch=epoch)
    print(f"Epoch {epoch + 1} validation: {val_score}")

    # Save a checkpoint each epoch so the fine-tuned weights outlive the kernel.
    model.save(output_dir)


In [None]:
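# # Create an evaluator for the test set (NOTE: `test_data` is not defined in the cells shown; build InputExamples from `test_df` first)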
# test_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(test_data, name="test")

# test_score = test_evaluator(model)
# print(f"Test Evaluation Score: {test_score:.2f}")

In [None]:
# movie1_embeddings = model.encode(test_data["Movie_1"].tolist(), convert_to_tensor=True)
# movie2_embeddings = model.encode(test_data["Movie_2"].tolist(), convert_to_tensor=True)

# # Compute the cosine similarity between movie1 & movie2
# cosine_similarities = torch.nn.functional.cosine_similarity(movie1_embeddings, movie2_embeddings).cpu().numpy()

# # Ground-truth values from the test set
# true_scores = test_data["Similarity_score"].values

# # Evaluate with MSE and Pearson correlation
# from sklearn.metrics import mean_squared_error
# from scipy.stats import pearsonr

# mse = mean_squared_error(true_scores, cosine_similarities)
# pearson_corr, _ = pearsonr(true_scores, cosine_similarities)

# print(f" Mean Squared Error (MSE): {mse:.4f}")
# print(f" Pearson Correlation: {pearson_corr:.4f}")