In [None]:
!pip install transformers torch faiss-cpu tqdm sqlalchemy
!pip install sentence-transformers pandas tqdm

In [None]:
import os
# Set before the training libraries below are imported.
os.environ["WANDB_DISABLED"] = "true"  # turn off Weights & Biases logging

import torch
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from tqdm import tqdm

# Fine-tuning configuration consumed by main().
TRAIN_FILE = "src/notebook/Data_Fine_Tune/train.xlsx"  # path to the training file
OUTPUT_DIR = "src/notebook/Data_Fine_Tune/BAAI/bge-m3"  # passed to train() as the output path
BASE_MODEL = "BAAI/bge-m3"  # model name handed to SentenceTransformer in build_model()
BATCH_SIZE = 4  # batch size of the training DataLoader
EPOCHS = 3  # number of epochs passed to model.fit()

def load_data(train_file):
    """Load question/answer training pairs from a CSV or Excel file.

    The reader is chosen from the file extension, compared
    case-insensitively, and ``train_file`` may be a string or any
    ``os.PathLike`` object. The bookkeeping columns ``date``, ``timestamp``
    and ``id`` are discarded when present, rows with a missing ``question``
    or ``answer`` are removed, and the remaining rows are shuffled with a
    fixed seed so repeated runs see the same order.

    Args:
        train_file: Path to a ``.csv``, ``.xlsx`` or ``.xls`` file.

    Returns:
        The cleaned, shuffled ``pandas.DataFrame`` with a fresh 0..n-1 index.

    Raises:
        ValueError: If the extension is not one of the supported formats.
        KeyError: Raised by pandas when the ``question`` or ``answer``
            column is absent from the file.
    """
    # Normalise once so "TRAIN.XLSX" and pathlib.Path inputs are accepted.
    lowered = os.fspath(train_file).lower()
    if lowered.endswith(".csv"):
        df = pd.read_csv(train_file)
    elif lowered.endswith((".xlsx", ".xls")):
        df = pd.read_excel(train_file)
    else:
        raise ValueError("Định dạng file không hợp lệ. Chỉ hỗ trợ .csv, .xlsx, .xls")

    # Drop 'date' and other columns that are not needed, if they exist.
    df = df.drop(columns=["date", "timestamp", "id"], errors="ignore")

    df = df.dropna(subset=["question", "answer"])
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle

    print(f"Loaded {len(df):,} training pairs")
    return df

def build_model(base_model):
    """Instantiate the base SentenceTransformer and place it on the CPU.

    Args:
        base_model: Model name or path passed to ``SentenceTransformer``.

    Returns:
        The loaded model after it has been moved to the CPU device.
    """
    print(f"Loading base model: {base_model}")
    cpu = torch.device("cpu")
    encoder = SentenceTransformer(base_model)
    # The CPU is used unconditionally; the message below gives the reason
    # (slower, but avoids running out of memory).
    encoder.to(cpu)
    print("Training on CPU (chậm hơn nhưng tránh OOM)")
    return encoder

def train(model, df, output_dir, batch_size=8, epochs=3):
    """Fine-tune ``model`` on the question/answer pairs in ``df``.

    Each row becomes an ``InputExample`` holding the pair
    ``[question, answer]``. The examples are optimised with
    ``MultipleNegativesRankingLoss``, and the first 10% of all training
    steps are used for learning-rate warm-up.

    Args:
        model: The SentenceTransformer to fine-tune (modified in place).
        df: DataFrame with ``question`` and ``answer`` columns.
        output_dir: Path handed to ``model.fit`` as ``output_path``.
        batch_size: Number of pairs per training batch.
        epochs: Number of passes over the training data.

    Raises:
        ValueError: If ``df`` contains no rows.
    """
    # Fail early with a clear message; an empty frame would otherwise only
    # surface further down, inside the data loader / training loop.
    if len(df) == 0:
        raise ValueError("No training pairs available: the data frame is empty")

    train_examples = [
        InputExample(texts=[row.question, row.answer])
        for row in tqdm(df.itertuples(index=False), total=len(df), desc="Building examples")
    ]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model)

    warmup_steps = int(len(train_dataloader) * epochs * 0.1)

    # Automatic mixed precision is a CUDA feature. Requesting it for a model
    # that sits on the CPU is at best disabled with warnings and, depending
    # on the installed library versions, may be rejected outright, so only
    # enable it when the model is actually on a CUDA device.
    use_amp = model.device.type == "cuda"

    print("Bắt đầu huấn luyện...")
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
        output_path=output_dir,
        show_progress_bar=True,
        save_best_model=True,
        use_amp=use_amp,
    )

    print(f"Model đã được lưu tại: {output_dir}")

def main():
    """Run the fine-tuning pipeline end to end using the module-level settings.

    Loads the training pairs first, then the base model, and finally trains
    and saves the model.
    """
    pairs = load_data(TRAIN_FILE)
    encoder = build_model(BASE_MODEL)
    train(encoder, pairs, output_dir=OUTPUT_DIR, batch_size=BATCH_SIZE, epochs=EPOCHS)


# Start the pipeline only when this file is run directly, not when imported.
if __name__ == "__main__":
    main()
