In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from typing import List, Tuple, Dict
import pickle
from sklearn.metrics import precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the description-pair similarity table; later cells read its
# 'Similarity_score', 'Describe_1' and 'Describe_2' columns.
df = pd.read_csv('../data/data_similarity.csv')

In [None]:
df

In [None]:
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows, then print the remaining duplicate count and the new shape.
df = df.drop_duplicates()
print(df.duplicated().sum())
print(df.shape)

In [None]:
# Positive pairs: rows whose similarity score is at least 0.7.
movie_positive = df[df['Similarity_score'] >= 0.7]


In [None]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer, T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModel
from underthesea import sent_tokenize
from tqdm import tqdm
import numpy as np
import logging

# Configure logging: INFO level, with timestamp and level name on every message
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Use the GPU when torch reports CUDA as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a model and its tokenizer, with exception handling
def load_model(model_class, model_name, device):
    """Load a pretrained model and the tokenizer that belongs to the same checkpoint.

    Args:
        model_class: Class exposing ``from_pretrained`` (e.g. ``MarianMTModel``).
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        device: Device the loaded model is moved to.

    Returns:
        Tuple ``(model, tokenizer)``.

    Raises:
        Exception: Any loading error is logged and then re-raised unchanged.
    """
    try:
        model = model_class.from_pretrained(model_name).to(device)
        # The tokenizer is resolved from the checkpoint itself. The previous
        # lookup of `model.tokenizer_class` relied on an attribute that
        # transformers model objects do not provide, so loading always failed.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        logging.info(f"Loaded {model_name} successfully")
        return model, tokenizer
    except Exception as e:
        logging.error(f"Error loading {model_name}: {e}")
        raise

# Marian translators for the Vietnamese -> English -> Vietnamese round trip used by
# augment_text, plus the T5 model that paraphrase_en samples from.
vi_en_model, vi_en_tokenizer = load_model(MarianMTModel, "Helsinki-NLP/opus-mt-vi-en", device)
en_vi_model, en_vi_tokenizer = load_model(MarianMTModel, "Helsinki-NLP/opus-mt-en-vi", device)
t5_model, t5_tokenizer = load_model(T5ForConditionalGeneration, "t5-base", device)

# Split long text into sentence-aligned pieces
def split_long_text(text, max_length=512):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` T5 tokens.

    Sentences come from ``sent_tokenize`` and are measured with ``t5_tokenizer``
    (special tokens excluded). Sentences are never cut: one that is longer than
    ``max_length`` on its own still becomes a chunk by itself.

    Returns:
        List of chunk strings. When no chunk was produced, a single-element
        list holding the first ``max_length`` *characters* of ``text``.
    """
    pieces = []
    buffer, buffer_tokens = "", 0

    for sentence in sent_tokenize(text):
        n_tokens = len(t5_tokenizer.encode(sentence, add_special_tokens=False))

        if buffer_tokens + n_tokens > max_length:
            # The sentence does not fit: flush what we have and start afresh with it.
            if buffer:
                pieces.append(buffer.strip())
            buffer, buffer_tokens = sentence, n_tokens
            continue

        buffer = buffer + " " + sentence
        buffer_tokens += n_tokens

    if buffer:
        pieces.append(buffer.strip())

    return pieces or [text[:max_length]]

def translate_text(text, model, tokenizer, max_length=512):
    """Translate text with batch processing.

    Args:
        text: A string or a list of strings; both forms are passed by augment_text.
        model: Object whose ``generate`` accepts the tokenizer output as keywords.
        tokenizer: Callable tokenizer that also provides ``decode``.
        max_length: Inputs are truncated to this many tokens.

    Returns:
        List of decoded strings, one per generated sequence.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        padding=True,
    )
    encoded = encoded.to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated = model.generate(**encoded)

    translations = []
    for sequence in generated:
        translations.append(tokenizer.decode(sequence, skip_special_tokens=True))
    return translations

def paraphrase_en(text, num_variants=10):
    """Sample English paraphrases of ``text`` from the T5 model.

    The input is prefixed with ``"paraphrase: "`` and ``num_variants`` sequences
    are sampled (top-p 0.95, top-k 50, temperature 0.9).

    NOTE(review): the checkpoint loaded for ``t5_model`` is plain "t5-base";
    confirm it actually responds to a "paraphrase:" prompt.

    Returns:
        List of ``num_variants`` decoded strings.
    """
    prompt = f"paraphrase: {text}"
    encoded = t5_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)

    sampling_options = dict(
        max_length=512,
        num_return_sequences=num_variants,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.9,
    )
    with torch.no_grad():
        sampled = t5_model.generate(**encoded, **sampling_options)

    paraphrases = []
    for token_ids in sampled:
        paraphrases.append(t5_tokenizer.decode(token_ids, skip_special_tokens=True))
    return paraphrases

def augment_text(text, num_variants=50):
    """Augment text: translate to English, paraphrase, translate back.

    Each chunk from ``split_long_text`` is translated Vietnamese -> English,
    paraphrased into ``min(10, num_variants)`` variants and translated back.
    Variant ``k`` of the whole text is the space-joined ``k``-th variant of
    every chunk (cycling within a chunk that has fewer variants).

    Returns:
        On success, at most ``min(num_variants, <variants of the first chunk>)``
        strings. On any exception the error is logged and ``num_variants``
        copies of the unmodified ``text`` are returned instead.

    NOTE(review): the success path is capped at 10 variants while the fallback
    returns ``num_variants`` items, so the two paths can differ in length.
    """
    try:
        per_chunk_variants = []
        for piece in split_long_text(text):
            english = translate_text(piece, vi_en_model, vi_en_tokenizer)[0]
            paraphrases = paraphrase_en(english, num_variants=min(10, num_variants))
            per_chunk_variants.append(translate_text(paraphrases, en_vi_model, en_vi_tokenizer))

        # The first chunk decides how many stitched variants can be produced.
        n_outputs = min(num_variants, len(per_chunk_variants[0]))
        stitched = [
            " ".join(options[k % len(options)] for options in per_chunk_variants)
            for k in range(n_outputs)
        ]
        return stitched[:num_variants]
    except Exception as e:
        logging.error(f"Error augmenting text: {e}")
        return [text] * num_variants

# Split a token sequence into overlapping windows for PhoBERT
def split_into_chunks(tokens, max_length=256, stride=128):
    """Cut ``tokens`` into windows of up to ``max_length`` items, advancing by ``stride``.

    Windowing stops as soon as a window reaches the end of the sequence, so no
    trailing window is emitted whose content is already fully contained in the
    previous one (e.g. 256 tokens with the defaults give one window, not two).

    Args:
        tokens: Sliceable sequence (a list of token ids in this notebook).
        max_length: Maximum window size; must be positive.
        stride: Offset between consecutive window starts; must be positive.

    Returns:
        List of non-empty windows; an empty list for empty input.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """
    if max_length <= 0 or stride <= 0:
        raise ValueError("max_length and stride must be positive")

    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokens[start:start + max_length])
        # This window already covers the tail; later ones would only repeat it.
        if start + max_length >= len(tokens):
            break
    return chunks

# Extract one embedding per text with PhoBERT
def extract_embeddings(texts, tokenizer, model, max_length=256, stride=128, device='cpu', batch_size=32):
    """Embed each text as the mean of the first-token vectors of its windows.

    Every text is tokenized without special tokens, cut into overlapping windows
    by ``split_into_chunks`` and each window is wrapped with the tokenizer's
    special tokens. That way position 0 of *every* window is the start/CLS
    token whose hidden state is read; cutting an already-wrapped sequence only
    gives the first window such a token.

    Args:
        texts: Sequence of texts; each item is converted with ``str()`` and stripped.
        tokenizer: Tokenizer providing ``encode``, ``pad_token_id``,
            ``num_special_tokens_to_add`` and ``build_inputs_with_special_tokens``.
        model: Encoder whose output exposes ``last_hidden_state``; it is moved
            to ``device`` and put in eval mode.
        max_length: Maximum window length *including* special tokens.
        stride: Offset, in content tokens, between consecutive windows.
        device: Device used for the forward passes.
        batch_size: Number of texts processed per forward pass.

    Returns:
        List with one 1-D tensor per input text, in input order, on ``device``.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    content_budget = max_length - tokenizer.num_special_tokens_to_add(pair=False)
    if content_budget <= 0:
        raise ValueError("max_length is too small to hold any token besides the special tokens")

    embeddings = []
    model.to(device)
    model.eval()

    for i in tqdm(range(0, len(texts), batch_size), desc="Computing embeddings"):
        windows = []           # all windows of the batch, flattened text after text
        windows_per_text = []  # number of consecutive rows of `windows` owned by each text

        for text in texts[i:i + batch_size]:
            token_ids = tokenizer.encode(str(text).strip(), add_special_tokens=False)
            # An empty text still gets one window (special tokens only), so every
            # text owns at least one row and output order always matches input order.
            pieces = split_into_chunks(token_ids, content_budget, stride) or [[]]
            wrapped = [tokenizer.build_inputs_with_special_tokens(list(piece)) for piece in pieces]
            windows.extend(wrapped)
            windows_per_text.append(len(wrapped))

        # Pad only up to the longest window of this batch; no all-padding rows are created.
        width = max(len(window) for window in windows)
        input_ids = torch.tensor(
            [window + [tokenizer.pad_token_id] * (width - len(window)) for window in windows]
        ).to(device)
        attention_mask = torch.tensor(
            [[1] * len(window) + [0] * (width - len(window)) for window in windows]
        ).to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # Average the rows that belong to each text.
        start_idx = 0
        for num_chunks in windows_per_text:
            embeddings.append(cls_embeddings[start_idx:start_idx + num_chunks].mean(dim=0))
            start_idx += num_chunks

    return embeddings

# Mixup augmentation on embedding pairs
def mixup_embeddings(embeddings_1, embeddings_2, scores, target_size=500000, rng=None):
    """Create mixup samples until the data set would reach ``target_size`` items.

    Each new sample blends two distinct existing samples with a weight drawn
    from ``Beta(0.2, 0.2)``; the same weight is applied to both embeddings and
    to the score.

    Args:
        embeddings_1: Sequence of first embeddings (must support ``*`` and ``+``).
        embeddings_2: Sequence of second embeddings, aligned with ``embeddings_1``.
        scores: Sequence of similarity scores, aligned with the embeddings.
        target_size: Desired total size; ``target_size - len(embeddings_1)``
            samples are generated.
        rng: Optional random source providing ``choice`` and ``beta`` (e.g. a
            ``numpy.random.Generator``) for reproducible runs. Defaults to the
            global ``np.random`` state.

    Returns:
        List of ``[new_emb1, new_emb2, new_score]``; empty when the inputs are
        empty or already at least ``target_size`` long.

    Raises:
        ValueError: If the three inputs differ in length.
    """
    current_size = len(embeddings_1)
    if len(embeddings_2) != current_size or len(scores) != current_size:
        raise ValueError("embeddings_1, embeddings_2 and scores must have the same length")

    samples_needed = target_size - current_size
    # Nothing to mix from, or nothing to add: previously the empty case crashed
    # inside np.random.choice.
    if current_size == 0 or samples_needed <= 0:
        return []

    sampler = np.random if rng is None else rng
    augmented_data = []

    for _ in tqdm(range(samples_needed), desc="Mixup augmentation"):
        idx1 = int(sampler.choice(current_size))
        if current_size > 1:
            # Draw from the remaining indices so a sample is never mixed with
            # itself (which would only produce an exact duplicate).
            idx2 = int(sampler.choice(current_size - 1))
            if idx2 >= idx1:
                idx2 += 1
        else:
            idx2 = idx1
        lambda_ = sampler.beta(0.2, 0.2)
        new_emb1 = lambda_ * embeddings_1[idx1] + (1 - lambda_) * embeddings_1[idx2]
        new_emb2 = lambda_ * embeddings_2[idx1] + (1 - lambda_) * embeddings_2[idx2]
        new_score = lambda_ * scores[idx1] + (1 - lambda_) * scores[idx2]
        augmented_data.append([new_emb1, new_emb2, new_score])

    return augmented_data

# Augment class 2 (the high-similarity pairs)
def augment_class_2(train_df, num_text_variants=50, target_size=500000):
    """Augment the high-similarity pairs of ``train_df`` by text rewriting and mixup.

    Steps:
        1. Rows with ``Similarity_score > 0.6`` are rewritten with ``augment_text``.
        2. Both description columns of the rewritten rows are embedded with PhoBERT.
        3. ``mixup_embeddings`` generates samples up to ``target_size``.

    Args:
        train_df: Frame with 'Describe_1', 'Describe_2' and 'Similarity_score' columns.
        num_text_variants: Variants requested from ``augment_text`` per description.
        target_size: Target passed on to ``mixup_embeddings``.

    Returns:
        Frame with columns 'Describe_1', 'Describe_2', 'Similarity_score' holding the
        rewritten rows followed by one row per mixup sample. An empty frame with the
        same columns is returned when there is nothing to augment (previously this
        case loaded PhoBERT and then crashed inside the mixup step).

    NOTE(review): mixup rows carry only a score; their text columns are ``None``
    and the mixed embeddings themselves are discarded. Confirm that downstream
    consumers are meant to receive text-less rows.
    NOTE(review): the filter uses ``> 0.6`` while the notebook defines positives
    as ``>= 0.7`` elsewhere; confirm which threshold is intended.
    """
    columns = ['Describe_1', 'Describe_2', 'Similarity_score']

    class_2_df = train_df[train_df['Similarity_score'] > 0.6]
    logging.info(f"Original class 2 size: {len(class_2_df)}")
    if class_2_df.empty:
        return pd.DataFrame(columns=columns)

    # Step 1: text augmentation
    augmented_data = []
    for _, row in tqdm(class_2_df.iterrows(), total=len(class_2_df), desc="Text augmentation"):
        desc1_variants = augment_text(row['Describe_1'], num_variants=num_text_variants)
        desc2_variants = augment_text(row['Describe_2'], num_variants=num_text_variants)
        # zip stops at the shorter list, so both sides always stay paired.
        for d1, d2 in zip(desc1_variants, desc2_variants):
            augmented_data.append([d1, d2, row['Similarity_score']])

    text_augmented_df = pd.DataFrame(augmented_data, columns=columns)
    logging.info(f"After text augmentation: {len(text_augmented_df)}")
    if text_augmented_df.empty:
        # Nothing to embed or mix; skip loading the encoder.
        return text_augmented_df

    # Step 2: compute embeddings with PhoBERT
    phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
    phobert_model = AutoModel.from_pretrained("vinai/phobert-base").to(device)

    embedded_columns = []
    for column in ('Describe_1', 'Describe_2'):
        column_embeddings = extract_embeddings(
            text_augmented_df[column].tolist(),
            tokenizer=phobert_tokenizer,
            model=phobert_model,
            max_length=256,
            stride=128,
            device=device,
            batch_size=32
        )
        # Convert the embeddings from tensors to numpy arrays
        embedded_columns.append([emb.cpu().numpy() for emb in column_embeddings])
    embeddings_1, embeddings_2 = embedded_columns

    # Collect the scores
    scores = text_augmented_df['Similarity_score'].tolist()

    # Step 3: mixup
    mixup_data = mixup_embeddings(embeddings_1, embeddings_2, scores, target_size=target_size)

    # Combine the data (mixup rows contribute their score only)
    final_data = augmented_data + [[None, None, score] for _, _, score in mixup_data]
    final_df = pd.DataFrame(final_data, columns=columns)
    logging.info(f"Final class 2 size: {len(final_df)}")
    return final_df

# Augment the positive pairs, append the result to the originals and save the combined set.
# NOTE(review): augment_class_2 also returns mixup rows whose Describe_1/Describe_2 are None;
# they end up in this CSV and in every later split — confirm that is intended.
augmented_df = augment_class_2(movie_positive, num_text_variants=50, target_size=500000)
movie_positive = pd.concat([movie_positive, augmented_df], ignore_index=True)
movie_positive.to_csv('train_balanced_500k.csv', index=False)


In [None]:
movie_positive

In [None]:
# 80/10/10 train/valid/test split of the positive pairs (20% held out, then halved).
# NOTE(review): movie_positive may already contain augmented variants at this point, so
# rewrites of the same source pair can land in different splits — confirm this is acceptable.
train_df_positive, temp_df_positive = train_test_split(movie_positive, test_size=0.2, random_state=42)
valid_df_positive, test_df_positive = train_test_split(temp_df_positive, test_size=0.5, random_state=42)

In [None]:
# Negative pairs: similarity at or below 0.3, down-sampled to at most 500,000 rows.
# The sample size is capped at the pool size because DataFrame.sample raises when asked
# for more rows than exist (without replacement).
movie_negative = df[df['Similarity_score'] <= 0.3]
movie_negative = movie_negative.sample(n=min(500000, len(movie_negative)), random_state=42)
# 80/10/10 train/valid/test split
train_df_negative, temp_df_negative = train_test_split(movie_negative, test_size=0.2, random_state=42)
valid_df_negative, test_df_negative = train_test_split(temp_df_negative, test_size=0.5, random_state=42)

In [None]:
# Hard negatives: similarity strictly between 0.3 and 0.7, down-sampled to at most 500,000 rows.
# The sample size is capped at the pool size because DataFrame.sample raises when asked
# for more rows than exist (without replacement).
movie_hard_negative = df[(df['Similarity_score'] > 0.3) & (df['Similarity_score'] < 0.7)]
movie_hard_negative = movie_hard_negative.sample(n=min(500000, len(movie_hard_negative)), random_state=42)
# 80/10/10 train/valid/test split
train_df_hard_negative, temp_df_hard_negative = train_test_split(movie_hard_negative, test_size=0.2, random_state=42)
valid_df_hard_negative, test_df_hard_negative = train_test_split(temp_df_hard_negative, test_size=0.5, random_state=42)

In [None]:
# Merge the three classes of each split, then shuffle the split with a fixed seed.
train_df = pd.concat(
    [train_df_positive, train_df_hard_negative, train_df_negative]
).sample(frac=1, random_state=42)
valid_df = pd.concat(
    [valid_df_positive, valid_df_hard_negative, valid_df_negative]
).sample(frac=1, random_state=42)
test_df = pd.concat(
    [test_df_positive, test_df_hard_negative, test_df_negative]
).sample(frac=1, random_state=42)

# Report the size of every split
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)


In [None]:
# Persist the three splits as parquet files (fastparquet engine, index not written).
train_df.to_parquet("../data/data_train.parquet", engine="fastparquet", index=False)
valid_df.to_parquet("../data/data_valid.parquet", engine="fastparquet", index=False)
test_df.to_parquet("../data/data_test.parquet", engine="fastparquet", index=False)

In [2]:
# Reload the splits written by the previous cell, so later cells can start from the files
# without re-running the augmentation.
train_df = pd.read_parquet("../data/data_train.parquet")
valid_df = pd.read_parquet("../data/data_valid.parquet")
test_df = pd.read_parquet("../data/data_test.parquet")

In [None]:
train_df

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
# This value is also captured as the default `device` argument of train_model below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MLP classifier over a pair of PhoBERT embeddings
class PhoBERTFineTuneClassifier(nn.Module):
    def __init__(self, num_classes: int = 3, hidden_size: int = 768):
        """
        MLP head that classifies a pair of fixed-size embeddings.

        The two embeddings are concatenated, passed through dropout and then
        through two hidden blocks (Linear -> BatchNorm -> ReLU -> Dropout)
        followed by an output layer.

        Args:
            num_classes: Number of output classes.
            hidden_size: Size of each input embedding (768 by default).
        """
        super().__init__()
        half_width = hidden_size // 2

        # Dropout applied to the concatenated input pair
        self.dropout = nn.Dropout(0.3)

        layers = [
            # block 1: concatenated pair -> hidden_size
            nn.Linear(hidden_size * 2, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            # block 2: hidden_size -> hidden_size // 2
            nn.Linear(hidden_size, half_width),
            nn.BatchNorm1d(half_width),
            nn.ReLU(),
            nn.Dropout(0.3),
            # output layer
            nn.Linear(half_width, num_classes),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, embedding1: torch.Tensor, embedding2: torch.Tensor) -> torch.Tensor:
        """Return class logits for the concatenation of the two embeddings."""
        pair = torch.cat([embedding1, embedding2], dim=-1)
        return self.mlp(self.dropout(pair))

# Dataset definition
class PhoBertDataset(Dataset):
    """Description pairs with a similarity class, served as precomputed embeddings.

    Each item looks up the embeddings of its two descriptions by text and
    returns them together with the class label derived from the similarity score.
    """

    def __init__(self, df: pd.DataFrame, embeddings_file: str, movie_desc_df: pd.DataFrame):
        """
        Args:
            df: Pair table with 'Describe_1', 'Describe_2' and 'Similarity_score' columns.
            embeddings_file: Pickle file holding a sequence of numpy arrays.
            movie_desc_df: Frame whose 'describe' column provides the text -> index mapping.
                Presumably row ``i`` corresponds to embedding ``i`` in the pickle — TODO confirm.

        Raises:
            ValueError: If a similarity score is NaN.
        """
        # SECURITY: pickle.load can execute arbitrary code; only open trusted files here.
        with open(embeddings_file, 'rb') as f:
            self.movie_embeddings = [torch.from_numpy(emb).float() for emb in pickle.load(f)]
        self.movie_df = movie_desc_df
        self.sentences1 = df['Describe_1'].astype(str).tolist()
        self.sentences2 = df['Describe_2'].astype(str).tolist()
        self.similarity = df['Similarity_score'].tolist()
        self.labels = [self._convert_to_class(score) for score in self.similarity]
        # If a description occurs more than once, the last occurrence wins.
        self.desc_to_idx = {desc: idx for idx, desc in enumerate(self.movie_df['describe'])}

    def _convert_to_class(self, score: float) -> int:
        """Map a similarity score to a class: <= 0.3 -> 0, (0.3, 0.7) -> 1, >= 0.7 -> 2.

        Negative scores belong to class 0, matching the ``<= 0.3`` filter used when
        the negative pairs are selected; they used to fall through to class 2.

        Raises:
            ValueError: If ``score`` is NaN (it would otherwise be labelled class 2).
        """
        if score != score:  # NaN is the only value that differs from itself
            raise ValueError("Similarity score is NaN; cannot derive a class label")
        if score <= 0.3:
            return 0
        if score < 0.7:
            return 1
        return 2

    def __len__(self) -> int:
        return len(self.labels)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return both embeddings and the label of pair ``idx``.

        Raises:
            KeyError: If either description has no entry in the text -> index mapping.
        """
        desc1_idx = self.desc_to_idx.get(self.sentences1[idx], -1)
        desc2_idx = self.desc_to_idx.get(self.sentences2[idx], -1)
        if desc1_idx == -1 or desc2_idx == -1:
            raise KeyError(f"Embedding not found for {self.sentences1[idx]} or {self.sentences2[idx]}")
        return {
            'embedding1': self.movie_embeddings[desc1_idx],
            'embedding2': self.movie_embeddings[desc2_idx],
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Collate function
def collate_fn(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
    """Stack the per-sample tensors of a batch along a new leading dimension.

    Args:
        batch: Samples, each with 'embedding1', 'embedding2' and 'label' tensors.

    Returns:
        Dict with the same three keys, each holding the stacked tensor.
    """
    collated = {}
    for key in ('embedding1', 'embedding2', 'label'):
        collated[key] = torch.stack([sample[key] for sample in batch])
    return collated

# Evaluation function
def evaluate_model(model: nn.Module, dataloader: DataLoader, criterion: nn.Module, device: torch.device) -> Tuple[float, float, float]:
    """Evaluate the model on a data set.

    Runs the model in eval mode without gradients, accumulates the
    sample-weighted loss and computes weighted precision and F1 over all
    predictions. The metrics are also printed.

    Returns:
        ``(average loss per sample, weighted precision, weighted F1)``.
    """
    model.eval()
    predictions, targets = [], []
    loss_sum = 0.0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            emb_a = batch['embedding1'].to(device)
            emb_b = batch['embedding2'].to(device)
            labels = batch['label'].to(device)

            logits = model(emb_a, emb_b)
            # Weight the batch loss by its size so the final mean is per sample.
            loss_sum += criterion(logits, labels).item() * labels.size(0)

            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

    avg_loss = loss_sum / len(dataloader.dataset)
    precision = precision_score(targets, predictions, average='weighted', zero_division=0)
    f1 = f1_score(targets, predictions, average='weighted', zero_division=0)
    print(f"Loss: {avg_loss:.4f} | Precision: {precision:.4f} | F1: {f1:.4f}")
    return avg_loss, precision, f1

def train_model(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader, criterion: nn.Module,
                optimizer: torch.optim.Optimizer, scheduler, epochs: int = 10, device=device,
                patience: int = 3, checkpoint_path: str = 'best_phobert_finetuned.pt') -> None:
    """Train the model with mixed precision, LR scheduling and early stopping.

    After each epoch the model is evaluated on ``val_loader``; the scheduler is
    stepped with the validation loss, and the weights are saved to
    ``checkpoint_path`` whenever the validation F1 improves. Training stops
    once F1 has not improved for ``patience`` consecutive epochs.

    Args:
        model: Network called as ``model(embedding1, embedding2)``.
        train_loader: Batches with 'embedding1', 'embedding2' and 'label'.
        val_loader: Validation batches in the same format.
        criterion: Loss taking ``(logits, labels)``.
        optimizer: Optimizer over the model parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        epochs: Maximum number of epochs.
        device: Device for the batches. The default is the module-level
            ``device`` as it was when this function was defined.
        patience: Epochs without F1 improvement tolerated before stopping.
        checkpoint_path: File receiving the best ``state_dict``.

    Note:
        ``model`` holds the last-epoch weights on return. The best weights are
        only in ``checkpoint_path``, which is not written if F1 never exceeds 0.
    """
    # Autocast and loss scaling follow the device the batches actually live on,
    # not merely whether CUDA exists on this machine. Loss scaling is only
    # enabled for CUDA, so CPU training takes the plain optimizer step.
    device_type = torch.device(device).type
    scaler = torch.amp.GradScaler(enabled=(device_type == 'cuda'))
    best_f1, patience_counter = 0.0, 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            embedding1 = batch['embedding1'].to(device)
            embedding2 = batch['embedding2'].to(device)
            labels = batch['label'].to(device)
            optimizer.zero_grad()
            with torch.amp.autocast(device_type=device_type):
                logits = model(embedding1, embedding2)
                loss = criterion(logits, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item() * labels.size(0)

        avg_train_loss = train_loss / len(train_loader.dataset)
        print(f"Train Loss: {avg_train_loss:.4f}")

        # Evaluate on the validation set
        val_loss, val_precision, val_f1 = evaluate_model(model, val_loader, criterion, device)
        scheduler.step(val_loss)  # Adjust the learning rate

        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), checkpoint_path)
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

# Main execution
if __name__ == "__main__":
    import os  # local import: only used to locate the saved checkpoint below

    # Prepare the data
    # NOTE(review): these are CSV files in the working directory, while earlier cells
    # write parquet files under ../data/ — confirm which files are meant to be used.
    movies_df = pd.read_csv('data-film-final.csv')

    train_df = pd.read_csv('data_train.csv')
    valid_df = pd.read_csv('data_valid.csv')
    test_df = pd.read_csv('data_test.csv')

    # All three data sets read the same embeddings pickle.
    train_data = PhoBertDataset(train_df, "embedding_train.pkl", movies_df)
    val_data = PhoBertDataset(valid_df, "embedding_train.pkl", movies_df)
    test_data = PhoBertDataset(test_df, "embedding_train.pkl", movies_df)

    train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn, num_workers=4)
    val_loader = DataLoader(val_data, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=4)
    test_loader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate_fn, num_workers=4)

    # Build the model and train it
    model = PhoBERTFineTuneClassifier(num_classes=3).to(device)
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.5, 1.5, 50.0]).to(device))
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

    train_model(model, train_loader, val_loader, criterion, optimizer, scheduler)

    # train_model leaves the last-epoch weights in `model`. Restore the checkpoint with
    # the best validation F1 so the test metrics describe the model that was kept.
    best_checkpoint = 'best_phobert_finetuned.pt'
    if os.path.exists(best_checkpoint):
        model.load_state_dict(torch.load(best_checkpoint, map_location=device))

    # Evaluate on the test set
    print("Evaluating on test set:")
    test_loss, test_precision, test_f1 = evaluate_model(model, test_loader, criterion, device)

  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 1:   0%|          | 1/100036 [02:25<4051:13:06, 145.79s/it, loss=0.106]


KeyboardInterrupt: 

In [None]:
# Save the model and tokenizer to the "fine_tuned_phobert" directory.
# NOTE(review): the `model` built in the training cell is a plain nn.Module
# (PhoBERTFineTuneClassifier), and no `tokenizer` variable is assigned in the visible
# cells — presumably this cell belongs to an earlier version of the notebook; confirm
# it still runs on a fresh kernel.
model.save_pretrained("fine_tuned_phobert")
tokenizer.save_pretrained("fine_tuned_phobert")


In [18]:
# Find the longest film description, measured in tokens.
# NOTE(review): `tokenizer` is not assigned in any visible cell, and this rebinds `df`
# (previously the similarity table) — confirm the cell still runs on a fresh kernel.
df = pd.read_csv('../data/data-film-final.csv')
tokenized_lengths = [len(tokenizer.encode(sentence)) for sentence in df['describe']]
max_token_length = max(tokenized_lengths)
print(f"Độ dài tokens lớn nhất: {max_token_length}")

Độ dài tokens lớn nhất: 3242
