### 

In [1]:
import os
import time

from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer

# ================= CONFIGURATION =================
# Connection settings may be overridden with the NEO4J_URI, NEO4J_USERNAME and
# NEO4J_PASSWORD environment variables. The literals are kept only as
# fallbacks so that existing runs behave exactly as before when the variables
# are unset.
# NOTE(review): the fallback password looks like a live credential committed
# to source -- rotate it and remove the fallback once NEO4J_PASSWORD is
# provided by the environment.
URI = os.environ.get("NEO4J_URI", "neo4j+s://403c0411.databases.neo4j.io")
AUTH = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "20qU7hUdIdlzyXii3tmnCDgvAFkUh-NobhE52Oq7Dvw"),
)

# Vietnamese sentence-embedding model (described by the original author as
# the best Vietnamese embedding model available at the time of writing).
MODEL_NAME = "bkai-foundation-models/vietnamese-bi-encoder"
# Dimensionality of the vectors produced by this model.
VECTOR_DIMENSION = 768

# Vector indexes to create, as {index name: node label}; one entry for each of
# the two main labels that carry embeddings.
INDEXES = {
    "searchable_index": "Searchable",
    "general_info_index": "GeneralInfo",
}

def create_vector_indexes(driver):
    """Create the Neo4j vector indexes that similarity (RAG) search relies on.

    One index is created for every ``INDEXES`` entry (index name -> node
    label) on the ``embedding`` property, using ``VECTOR_DIMENSION``
    dimensions and cosine similarity. The statement uses ``IF NOT EXISTS``,
    so calling this function repeatedly is harmless.

    Index creation is best effort: a failure is printed for the affected
    index and the remaining indexes are still attempted.

    Args:
        driver: Neo4j driver used to open the session.
    """
    print("‚öôÔ∏è ƒêang ki·ªÉm tra v√† t·∫°o Vector Index...")
    with driver.session() as session:
        for index_name, label in INDEXES.items():
            # Cypher statement that creates the index (Neo4j 5.x and later).
            query = f"""
            CREATE VECTOR INDEX `{index_name}` IF NOT EXISTS
            FOR (n:{label}) ON (n.embedding)
            OPTIONS {{
                indexConfig: {{
                    `vector.dimensions`: {VECTOR_DIMENSION},
                    `vector.similarity_function`: 'cosine'
                }}
            }}
            """
            try:
                # Consume the result here so that any server-side failure is
                # raised inside this ``try`` and reported against this index,
                # instead of surfacing on the next ``run`` or at session close.
                session.run(query).consume()
            except Exception as e:
                print(f"   ‚ö†Ô∏è L·ªói t·∫°o index {index_name}: {e}")
            else:
                print(f"   ‚úÖ ƒê√£ t·∫°o/ki·ªÉm tra index: {index_name} cho Label: {label}")

def fetch_nodes_without_embedding(tx, batch_size=200):
    """Return nodes that have ``embedding_text`` but no ``embedding`` yet.

    Only nodes labelled ``Searchable`` or ``GeneralInfo`` are considered.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        batch_size: Upper bound on the number of nodes returned.

    Returns:
        A list of ``{"id": <element id>, "text": <embedding_text>}`` dicts,
        one per matching node.
    """
    cypher = """
    MATCH (n)
    WHERE n.embedding_text IS NOT NULL 
      AND n.embedding IS NULL
      AND (n:Searchable OR n:GeneralInfo)
    RETURN elementId(n) AS id, n.embedding_text AS text
    LIMIT $limit
    """
    pending = []
    for row in tx.run(cypher, limit=batch_size):
        # Copy just the two fields the caller needs out of each record.
        pending.append({"id": row["id"], "text": row["text"]})
    return pending

def update_node_embeddings(tx, updates):
    """Write a batch of computed vectors back onto their nodes in Neo4j.

    Each entry of ``updates`` is matched to its node by element id and the
    vector is stored in the node's ``embedding`` property through the
    ``db.create.setVectorProperty`` procedure.

    NOTE(review): newer Neo4j 5.x releases are reported to deprecate
    ``db.create.setVectorProperty`` in favour of
    ``db.create.setNodeVectorProperty`` -- confirm against the target server
    version before changing the query.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        updates: List of ``{"id": <element id>, "vector": <list of floats>}``.

    Returns:
        None.
    """
    cypher = """
    UNWIND $updates AS row
    MATCH (n) WHERE elementId(n) = row.id
    
    // S·ª≠a l·ªói: Th√™m YIELD node ƒë·ªÉ tu√¢n th·ªß c√∫ ph√°p Neo4j 5.x
    CALL db.create.setVectorProperty(n, 'embedding', row.vector)
    YIELD node 
    
    // K·∫øt th√∫c b·∫±ng RETURN ƒë·ªÉ transaction ho√†n t·∫•t h·ª£p l·ªá
    RETURN count(node)
    """
    parameters = {"updates": updates}
    tx.run(cypher, **parameters)
    
def main(batch_size=200):
    """Embed every pending node's text and store the vectors in Neo4j.

    Loads the sentence-transformer model, makes sure the vector indexes
    exist, then repeatedly fetches a batch of nodes that lack an embedding,
    encodes their text and writes the vectors back, until no such node is
    left.

    Args:
        batch_size: Maximum number of nodes fetched, encoded and written per
            iteration. Defaults to 200, the value that used to be hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1; an empty fetch would
            otherwise be reported as "everything processed".
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # The driver is used as a context manager so that it is closed even when
    # model loading, index creation or one of the batches raises.
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        # 1. Load the model once and reuse it for every batch.
        print(f"üöÄ ƒêang t·∫£i model AI: {MODEL_NAME}...")
        model = SentenceTransformer(MODEL_NAME)
        print("‚úÖ Model ƒë√£ s·∫µn s√†ng!")

        # 2. Create the indexes first (important).
        create_vector_indexes(driver)

        # 3. Batch-processing loop.
        total_processed = 0

        print("‚è≥ B·∫Øt ƒë·∫ßu qu√° tr√¨nh Vector h√≥a d·ªØ li·ªáu trong Neo4j...")

        with driver.session() as session:
            while True:
                # A. Fetch nodes that do not have a vector yet.
                nodes = session.execute_read(fetch_nodes_without_embedding, batch_size)

                if not nodes:
                    print("üéâ ƒê√£ x·ª≠ l√Ω h·∫øt to√†n b·ªô d·ªØ li·ªáu! Kh√¥ng c√≤n node n√†o thi·∫øu vector.")
                    break

                print(f"   üîπ ƒêang x·ª≠ l√Ω batch {len(nodes)} node...")

                # B. Compute the vectors (embeddings).
                texts = [n["text"] for n in nodes]
                # ``.tolist()`` turns the encoder output into plain Python
                # lists before it is sent to Neo4j as a query parameter.
                embeddings = model.encode(texts, show_progress_bar=False).tolist()

                # C. Pair every node id with its vector.
                updates = [
                    {"id": node["id"], "vector": vector}
                    for node, vector in zip(nodes, embeddings)
                ]

                # D. Write the batch to the database.
                session.execute_write(update_node_embeddings, updates)

                total_processed += len(nodes)
                print(f"   ‚úÖ ƒê√£ update xong {total_processed} node.")

# Run the embedding job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


üöÄ ƒêang t·∫£i model AI: bkai-foundation-models/vietnamese-bi-encoder...
‚úÖ Model ƒë√£ s·∫µn s√†ng!
‚öôÔ∏è ƒêang ki·ªÉm tra v√† t·∫°o Vector Index...
   ‚úÖ ƒê√£ t·∫°o/ki·ªÉm tra index: searchable_index cho Label: Searchable
   ‚úÖ ƒê√£ t·∫°o/ki·ªÉm tra index: general_info_index cho Label: GeneralInfo
‚è≥ B·∫Øt ƒë·∫ßu qu√° tr√¨nh Vector h√≥a d·ªØ li·ªáu trong Neo4j...
   üîπ ƒêang x·ª≠ l√Ω batch 200 node...




   ‚úÖ ƒê√£ update xong 200 node.
   üîπ ƒêang x·ª≠ l√Ω batch 200 node...




   ‚úÖ ƒê√£ update xong 400 node.
   üîπ ƒêang x·ª≠ l√Ω batch 200 node...




   ‚úÖ ƒê√£ update xong 600 node.
   üîπ ƒêang x·ª≠ l√Ω batch 200 node...




   ‚úÖ ƒê√£ update xong 800 node.
   üîπ ƒêang x·ª≠ l√Ω batch 200 node...




   ‚úÖ ƒê√£ update xong 1000 node.
   üîπ ƒêang x·ª≠ l√Ω batch 200 node...




   ‚úÖ ƒê√£ update xong 1200 node.
   üîπ ƒêang x·ª≠ l√Ω batch 200 node...




   ‚úÖ ƒê√£ update xong 1400 node.
   üîπ ƒêang x·ª≠ l√Ω batch 78 node...




   ‚úÖ ƒê√£ update xong 1478 node.
üéâ ƒê√£ x·ª≠ l√Ω h·∫øt to√†n b·ªô d·ªØ li·ªáu! Kh√¥ng c√≤n node n√†o thi·∫øu vector.


### Phương pháp đang sử dụng là Dense Vector Embedding (Nhúng vector mật độ cao) sử dụng kiến trúc Bi-Encoder.
Cụ thể về kỹ thuật:
#### 1. Phương pháp kỹ thuật: Bi-Encoder Dense Retrieval
- **Bi-Encoder là gì?** Model bkai-foundation-models/vietnamese-bi-encoder hoạt động theo cơ chế Bi-Encoder. Nghĩa là nó "đọc" đoạn văn bản (input text) một cách độc lập và nén toàn bộ ý nghĩa của đoạn văn đó thành một chuỗi số cố định (vector 768 chiều).
- **Cơ chế lưu trữ:** Đoạn code trên thực hiện việc Indexing (Lập chỉ mục). Nó biến đổi dữ liệu thô (Text) -> Vector -> Lưu vào Neo4j.
- **Thuật toán so khớp:** Khi tạo index (`vector.similarity_function: 'cosine'`), bạn đang quy định rằng sau này việc tìm kiếm sẽ dùng Cosine Similarity (độ tương đồng cosin - cosin của góc giữa 2 vector) để đo độ giống nhau về ngữ nghĩa.
#### 2. Code này CÓ gây ra "Ảo giác ngữ nghĩa" không?
Câu trả lời là: CÓ, bản thân cái vector được tạo ra từ code này chứa đựng nguy cơ đó. Tại sao?
- **Nén thông tin (Information Compression):** Model BKAI nén tất cả mọi thứ: Địa danh (Hà Nội) + Đối tượng (Chùa) + Tính chất (Cổ kính) vào chung 1 vector duy nhất.
- **Trọng số không kiểm soát:** Trong quá trình train, model có thể "học" rằng từ "Chùa" quan trọng hơn từ "Hà Nội". Khi đó: Vector của "Chùa Một Cột (Hà Nội)" sẽ nằm rất gần Vector của "Chùa Thiên Mụ (Huế)" (vì cùng là Chùa). Nó nằm xa Vector của "Phở Bát Đàn (Hà Nội)" (dù cùng địa lý nhưng khác hẳn về nội dung).
#### -> Hậu quả: Nếu bạn chỉ dùng cái vector này để search (Vector Search thuần túy), máy sẽ ưu tiên trả về "Các ngôi chùa ở khắp nơi" thay vì "Các địa điểm tại Hà Nội". Đây chính là nguyên nhân gốc rễ của ảo giác địa lý.

#### => Đây là bước bắt buộc phải làm. Bạn không thể sửa "vector" để nó hết ảo giác (vì bản chất vector là xác suất và ngữ nghĩa mờ). Phải sửa ở cách truy vấn (Query Strategy).
Đây là bước chuẩn bị nguyên liệu (Data + Vector).
Phải áp dụng phương pháp Hybrid Search (Tìm kiếm lai) ở bước Viết API tìm kiếm (bước sau):
- **Dùng Graph (Neo4j) để khóa địa lý:** Dùng câu lệnh Cypher `MATCH (n)-[:LOCATED_IN]->(p:Province {name: 'Hà Nội'})` để khoanh vùng.
- **Dùng Vector để tìm ngữ nghĩa:** Chỉ so sánh vector trong cái vùng đã khoanh đó.