In [None]:
!pip install transformers torch faiss-cpu tqdm sqlalchemy



In [None]:
import sqlite3, json
import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
import faiss

# Model identifier handed to AutoTokenizer/AutoModel.from_pretrained below.
BGE_MODEL = "models/bge-m3-law"
# Upper bound on tokens per text; longer inputs are truncated by the tokenizer
# call in embed_texts.
MAX_LENGTH = 512
# Number of texts per forward pass in embed_texts, selected by device type.
BATCH_SIZE_GPU = 16
BATCH_SIZE_CPU = 32

# Output path of the FAISS index written by main().
FAISS_OUT = "/content/drive/MyDrive/NCKH/PhapLuat_RAG/luu_tru_2/laws_bge.index"
# Output path of the SQLite database that receives the `vectors` table in main().
VECTOR_DB = "/content/drive/MyDrive/NCKH/PhapLuat_RAG/luu_tru_2/laws_bge.db"
# Input SQLite database read by build_docs_from_db (tables vanban, chuong,
# dieu, khoan, diem).
DB_PATH = "/content/drive/MyDrive/NCKH/PhapLuat_RAG/luu_tru_2/vbplfull.db"

# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Loaded once at import time; the model is moved to `device` and switched to
# eval mode because it is only used for inference in embed_texts.
tokenizer = AutoTokenizer.from_pretrained(BGE_MODEL)
model = AutoModel.from_pretrained(BGE_MODEL).to(device).eval()

@torch.no_grad()
def embed_texts(texts):
    """Embed texts into L2-normalised float32 vectors.

    Each batch is tokenised (padded, truncated to ``MAX_LENGTH`` tokens), run
    through the module-level ``model``, and reduced to one vector per text by
    averaging the last hidden states over non-padding tokens. The vectors are
    then scaled to unit L2 norm.

    Args:
        texts: Iterable of texts. Falsy or whitespace-only items are replaced
            by the literal placeholder string ``"[PAD]"``; every other item is
            converted with ``str()`` before tokenisation.

    Returns:
        ``numpy.ndarray`` of dtype float32 with one row per input text, in
        input order. For empty input the result has shape
        ``(0, model.config.hidden_size)``.
    """
    # str() keeps string inputs unchanged and lets truthy non-string values
    # (numbers, etc.) reach the tokenizer as text instead of failing there.
    cleaned = [str(t) if t and str(t).strip() else "[PAD]" for t in texts]
    if not cleaned:
        # np.vstack raises ValueError on an empty list; return an empty matrix
        # that still carries the embedding width.
        return np.zeros((0, model.config.hidden_size), dtype="float32")

    batch_size = BATCH_SIZE_GPU if device.type == "cuda" else BATCH_SIZE_CPU
    embeddings_parts = []

    for start in tqdm(range(0, len(cleaned), batch_size), desc="Embedding"):
        batch = cleaned[start:start + batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        last_hidden = model(**enc).last_hidden_state

        # NOTE(review): this is masked mean pooling. The reference BGE-M3 dense
        # embedding appears to use the first ([CLS]) token instead - confirm
        # which pooling this checkpoint was trained with.
        mask = enc["attention_mask"].unsqueeze(-1).float()
        summed = (last_hidden * mask).sum(dim=1)
        # Clamp so a row with an all-zero mask cannot divide by zero.
        counts = mask.sum(dim=1).clamp(min=1e-9)
        mean_pooled = summed / counts

        arr = mean_pooled.cpu().numpy()
        # Unit-normalise each row; the small epsilon guards zero vectors.
        arr = arr / (np.linalg.norm(arr, axis=1, keepdims=True) + 1e-9)
        embeddings_parts.append(arr.astype("float32"))

    return np.vstack(embeddings_parts)

def build_docs_from_db(conn):
    """Flatten the vanban/chuong/dieu/khoan/diem hierarchy into documents.

    The query LEFT JOINs the five tables, so every result row describes the
    deepest available level of one branch. For each row the content is taken
    from the deepest level that has both an id and non-empty text
    (diem, then khoan, then dieu); rows with no such content are skipped.
    Because dieu rows are joined through chuong, a vanban without any chuong
    produces no documents.

    Args:
        conn: Connection whose ``execute(sql).fetchall()`` yields the joined
            rows (e.g. a ``sqlite3.Connection``).

    Returns:
        List of dicts with keys ``"uuid"`` (colon-joined ``level:id`` path),
        ``"text"`` (bracketed law/level header followed by the content) and
        ``"metadata"`` (ids, numbers and document attributes of the row).
    """
    query = """
    SELECT
        v.vanban_id, v.ten as ten_vb, v.so_hieu, v.ngay_ban_hanh, v.ngay_hieu_luc, v.co_quan,
        c.chuong_id, c.ten as ten_chuong,
        d.dieu_id, d.so as so_dieu, d.noi_dung as nd_dieu,
        k.khoan_id, k.so as so_khoan, k.noi_dung as nd_khoan,
        di.diem_id, di.so as so_diem, di.noi_dung as nd_diem
    FROM vanban v
    LEFT JOIN chuong c ON c.vanban_id = v.vanban_id
    LEFT JOIN dieu d ON d.chuong_id = c.chuong_id
    LEFT JOIN khoan k ON d.dieu_id = k.dieu_id
    LEFT JOIN diem di ON k.khoan_id = di.khoan_id
    ORDER BY v.vanban_id, c.chuong_id, d.dieu_id, k.khoan_id, di.diem_id;
    """

    documents = []
    for (
        vanban_id, ten_vb, so_hieu, ngay_bh, ngay_hl, co_quan,
        chuong_id, ten_chuong,
        dieu_id, so_dieu, nd_dieu,
        khoan_id, so_khoan, nd_khoan,
        diem_id, so_diem, nd_diem,
    ) in conn.execute(query).fetchall():
        # One entry per level, shallowest first:
        # (id, content, human-readable label, uuid fragment).
        levels = (
            (dieu_id, nd_dieu, f"Điều {so_dieu}", f"dieu:{dieu_id}"),
            (khoan_id, nd_khoan, f"Khoản {so_khoan}", f"khoan:{khoan_id}"),
            (diem_id, nd_diem, f"Điểm {so_diem}", f"diem:{diem_id}"),
        )

        # Deepest level with both an id and content wins: Điểm > Khoản > Điều.
        content = next(
            (body for level_id, body, _, _ in reversed(levels)
             if level_id and body),
            None,
        )
        if content is None:
            # Nothing to embed for this row.
            continue

        # Labels and uuid fragments cover every level whose id is set, even
        # when the content itself came from a shallower level.
        present = [level for level in levels if level[0]]
        label_text = ", ".join(label for _, _, label, _ in present)
        law_header = f"Văn bản {ten_vb} ({so_hieu})"
        full_text = f"[{law_header} - {label_text}] {content}"

        id_path = [f"vanban:{vanban_id}", f"chuong:{chuong_id}"]
        id_path.extend(fragment for _, _, _, fragment in present)
        doc_uuid = ":".join(id_path)

        documents.append({
            "uuid": doc_uuid,
            "text": full_text,
            "metadata": {
                "vanban_id": vanban_id,
                "ten_vanban": ten_vb,
                "so_hieu": so_hieu,
                "ngay_ban_hanh": ngay_bh,
                "ngay_hieu_luc": ngay_hl,
                "co_quan": co_quan,
                "chuong_id": chuong_id,
                "ten_chuong": ten_chuong,
                "dieu_id": dieu_id,
                "so_dieu": so_dieu,
                "khoan_id": khoan_id,
                "so_khoan": so_khoan,
                "diem_id": diem_id,
                "so_diem": so_diem,
            },
        })

    return documents

def main():
    """Build the FAISS index and the companion SQLite vector store.

    Steps:
      1. Read the law hierarchy from ``DB_PATH`` and flatten it into docs.
      2. Embed every doc text with ``embed_texts``.
      3. Write an inner-product FAISS index to ``FAISS_OUT``.
      4. Rewrite the ``vectors`` table in ``VECTOR_DB`` so that each row's
         ``faiss_id`` is the position of its vector in the new index.

    Both SQLite connections are closed even when a step raises. If no docs
    are found, nothing is written.
    """
    # The source DB is only needed while building docs; release it before the
    # long-running embedding step.
    conn = sqlite3.connect(DB_PATH)
    try:
        docs = build_docs_from_db(conn)
    finally:
        conn.close()
    print("Docs built:", len(docs))

    if not docs:
        # No vectors means no embedding dimension, so an index cannot be built.
        print("No documents found; nothing to index.")
        return

    texts = [d["text"] for d in docs]
    embs = embed_texts(texts)
    print("Embeddings done:", embs.shape)

    # FAISS index: embed_texts returns unit-norm vectors, so inner product
    # ranks by cosine similarity.
    dim = embs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embs)
    faiss.write_index(index, FAISS_OUT)
    print("FAISS index saved to", FAISS_OUT)

    # Vector DB: one row per doc, keyed by uuid, with the raw embedding bytes.
    rows = (
        (
            d["uuid"],
            i,
            d["text"],
            json.dumps(d["metadata"], ensure_ascii=False),
            embs[i].tobytes(),
        )
        for i, d in enumerate(docs)
    )
    conn_vec = sqlite3.connect(VECTOR_DB)
    try:
        # `with` commits on success and rolls back if any statement fails.
        with conn_vec:
            conn_vec.execute("""
            CREATE TABLE IF NOT EXISTS vectors (
                uuid TEXT PRIMARY KEY,
                faiss_id INTEGER,
                doc_text TEXT,
                metadata_json TEXT,
                embedding BLOB,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );""")
            # The index above was rebuilt from scratch, so rows left over from
            # an earlier run would carry faiss_id values that point at the
            # wrong vectors. Clear them before inserting the new mapping.
            conn_vec.execute("DELETE FROM vectors")
            conn_vec.executemany("""
                INSERT OR REPLACE INTO vectors (uuid, faiss_id, doc_text, metadata_json, embedding)
                VALUES (?, ?, ?, ?, ?)
            """, rows)
    finally:
        conn_vec.close()
    print("Vector DB saved to", VECTOR_DB)


if __name__ == "__main__":
    main()


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Docs built: 324920


Embedding: 100%|██████████| 20308/20308 [3:11:47<00:00,  1.76it/s]


Embeddings done: (324920, 1024)
FAISS index saved to /content/drive/MyDrive/NCKH/PhapLuat_RAG/luu_tru_2/laws_bge.index
Vector DB saved to /content/drive/MyDrive/NCKH/PhapLuat_RAG/luu_tru_2/laws_bge.db
