In [None]:
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment before reading them.
load_dotenv()
# os.getenv returns None when the variable is unset. In the visible notebook this key is
# only referenced by the commented-out cloud QdrantClient configuration.
qdrant_api_key = os.getenv("qdrant_api_key")


# Load model embedding

In [1]:

import torch
import tqdm


from sentence_transformers import SentenceTransformer
from tqdm import tqdm

class BGEEmbedder:
    """Wrapper around a SentenceTransformer model that prepends a fixed instruction prefix.

    Every text passed to :meth:`embed` is prefixed with ``self.prefix`` (unless it
    already starts with it) and encoded with L2-normalised output.

    NOTE(review): the BGE-M3 model card appears to say that no instruction prefix is
    needed for this model. The prefix is kept because removing it would change every
    vector already stored - confirm before changing.
    """

    def __init__(self, model_name="BAAI/bge-m3"):
        """Load the sentence-transformers model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)
        self.prefix = "Represent this sentence for searching relevant passages: "

    def embed(self, texts, batch_size=16, show_progress_bar=True):
        """Encode ``texts`` into normalised embeddings.

        Args:
            texts: An iterable of strings, or a single string (treated as one text).
            batch_size: Batch size forwarded to ``SentenceTransformer.encode``.
            show_progress_bar: Forwarded to ``encode``; pass ``False`` for
                single-query calls to avoid flooding the cell output.

        Returns:
            Whatever ``self.model.encode`` returns for the prefixed texts, one
            embedding per input text, in input order.
        """
        # A bare string would otherwise be iterated character by character and
        # produce one embedding per character.
        if isinstance(texts, str):
            texts = [texts]

        # Prepend the instruction prefix to both queries and passages, skipping
        # texts that already carry it.
        texts_with_prefix = [
            text if text.startswith(self.prefix) else self.prefix + text
            for text in texts
        ]

        # Let encode() handle batching and return already-normalised vectors.
        embeddings = self.model.encode(
            texts_with_prefix,
            batch_size=batch_size,
            normalize_embeddings=True,
            show_progress_bar=show_progress_bar
        )
        return embeddings




In [None]:
# from sentence_transformers import SentenceTransformer

# # Load model BGE-M3
# model = SentenceTransformer("BAAI/bge-m3")

# def embed_fn(text: str):
#     # BGE-M3 khuy·∫øn ngh·ªã prefix truy v·∫•n b·∫±ng "Represent this sentence for searching relevant passages:"
#     if not text.startswith("Represent"):
#         text = "Represent this sentence for searching relevant passages: " + text
#     return model.encode(text, normalize_embeddings=True)


# Embedding phần content của payload

In [2]:
import json

def load_payloads_json(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    dropped when present.
    """
    with open(path, mode="r", encoding="utf-8-sig") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


In [3]:
# Load the payload list for one chunk file (here: the "overview" syllabus payloads).
# NOTE(review): this is an absolute path on the author's machine and will not resolve
# elsewhere - consider a configurable data directory.
payloads = load_payloads_json(r"D:\Learn\K√¨ 5\SEG301\Fap-Chat\data\Chunk_JSON\overview_syllabus_payloads.json")  
# Display the first payload to inspect its fields.
payloads[0]

{'subject_code': 'OTP101',
 'subject_name': 'Orientation and General Training Program_ƒê·ªãnh h∆∞·ªõng v√† R√®n luy·ªán t·∫≠p trung',
 'degree_level': 'Bachelor',
 'semester': 0,
 'credits': 0,
 'belong_to_combo': 'nan',
 'type': 'overview',
 'content': "TYPE: overview\nSubject Code: OTP101\nSubject Name: Orientation and General Training Program_ƒê·ªãnh h∆∞·ªõng v√† R√®n luy·ªán t·∫≠p trung\nDegree Level: Bachelor | Credits: 0 | Semester: 0\nBelong To Combo: None\nPre-requisites: None\nScoring Scale: 10.0 | Min Avg Mark to Pass: 0.0\nApproved: True on 8/5/2022\nSubject Link: https://flm.fpt.edu.vn/gui/role/student/Syllabuses.aspx?subCode=OTP101&curriculumID=2347\n\n--- TIME ALLOCATION ---\n5 weeks (fulltime) = 280 h* Module 1: Orientation-ƒê·ªãnh h∆∞·ªõng(1 week: 8 h/day * 5 days = 40 h)* Module 2: Military Training-Gi√°o d·ª•c qu·ªëc ph√≤ng(110 slots * 1.5 h/slot = 165 h)* Module 3: Experience Program 22 slots * 1.5 h = 33 h* Module 4: Vovinam 28 slots * 1,5 h/slot = 42 h\n--- TIME AL

In [4]:
# `payloads` comes from load_payloads_json in the cell above.

# Embed only the "content" field of each payload; `vectors` stays index-aligned with `payloads`.
contents = [p["content"] for p in payloads]
embedder = BGEEmbedder()               # create the embedder instance (loads the model)
vectors = embedder.embed(contents)     # one embedding per content string



# client.upsert(collection_name="flm_chunks", points=points)




Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
from collections import Counter

# Tally how many loaded payloads carry each "type" value (payloads without the key are skipped).
types = [p["type"] for p in payloads if "type" in p]
print(Counter(types))


Counter({'overview': 73})


In [15]:
from qdrant_client.models import PointStruct
import uuid

# Fail loudly when the embeddings and payloads are out of step (for example `payloads`
# was reloaded from another file without re-running the embedding cell); zip() would
# otherwise silently drop the surplus items and pair vectors with the wrong payloads.
if len(vectors) != len(payloads):
    raise ValueError(
        f"vectors/payloads length mismatch: {len(vectors)} != {len(payloads)}"
    )

# One Qdrant point per payload: a random UUID as id, the embedding as a plain list,
# and the full payload dict attached unchanged.
points = [
    PointStruct(
        id=str(uuid.uuid4()),
        vector=vec.tolist(),
        payload=payload
    )
    for vec, payload in zip(vectors, payloads)
]


In [None]:
# Number of points prepared for upload.
len(points)

In [None]:
# NOTE(review): `points_session` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel - presumably left over from an earlier session;
# confirm and remove.
points_session

# Gọi Qdrant vector DB

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from qdrant_client.models import Filter
# Alternative (disabled): connect to the hosted Qdrant cluster using the API key read from the environment.
# client = QdrantClient(
#     url=r"https://0f47d391-b7c1-45d9-a956-5f7228cd80f3.europe-west3-0.gcp.cloud.qdrant.io:6333",
#     api_key=qdrant_api_key,
#     prefer_grpc=False
# )
# Active configuration: a Qdrant server on localhost:6333.
client = QdrantClient(
    host="localhost",
    port=6333
)

# One-off maintenance snippets, kept disabled: (re)create the 1024-dim cosine collection,
# or clear it.
# client.recreate_collection(
#     collection_name="flm_fap",
#     vectors_config=VectorParams(size=1024, distance=Distance.COSINE)
# )
# client.delete(
#     collection_name="flm_fap",
#     points_selector=Filter(must=[])  # delete all points
# )


# Up data lên vector DB

In [16]:
import time

def safe_upsert(client, collection_name, points, retries=5, wait=5):
    """Upsert ``points`` into ``collection_name``, retrying on any exception.

    Args:
        client: Object exposing ``upsert(collection_name=..., points=...)``.
        collection_name: Target collection name.
        points: The batch of points to upsert.
        retries: Maximum number of attempts.
        wait: Seconds to pause between two consecutive attempts.

    Returns:
        True as soon as one attempt succeeds, False when every attempt failed
        (or when ``retries`` is 0).
    """
    for attempt in range(retries):
        try:
            client.upsert(collection_name=collection_name, points=points)
            return True
        except Exception as e:
            # Best-effort upload: report the failure and try again.
            print(f"‚ùå Retry {attempt+1}/{retries} - L·ªói: {e}")
            # Pause only when another attempt follows; sleeping after the final
            # failure just delays returning False.
            if attempt < retries - 1:
                time.sleep(wait)
    return False
# Upload in slices of 100 points; a slice that still fails after all retries is
# reported with its start index and the loop moves on to the next slice.
for i in range(0, len(points), 100):
    success = safe_upsert(client, "flm_fap", points[i:i+100])
    if not success:
        print("‚ö†Ô∏è Kh√¥ng th·ªÉ upload batch", i)


In [None]:
# Ask Qdrant for the exact number of points currently in the collection and report it.
count = client.count("flm_fap", exact=True).count
print(f"‚úÖ S·ªë vector ƒë√£ up: {count}")


In [None]:
# Create the embedder used by the search cells below.
# NOTE(review): the embedding cell earlier in the notebook already binds `embedder`;
# running both cells constructs the model twice.
embedder = BGEEmbedder()  

# Demo search

In [None]:
import pandas as pd
# NOTE(review): absolute path on the author's machine - will not resolve elsewhere.
df_flm = pd.read_csv(r'D:\Learn\K√¨ 5\SEG301\DEMO_local\FLM\FINAL_DF_FLM.csv')

# Map each subject code to a "CODE - Name" label. Inner quotes are single so the
# f-string also parses on Python versions before 3.12 (nested same-type quotes are
# only accepted from 3.12 on).
subject_map = {
    row["SubjectCode"]: f"{row['SubjectCode']} - {row['Subject Name']}"
    for _, row in df_flm[["SubjectCode", "Subject Name"]].dropna().drop_duplicates().iterrows()
}

# Embed all labels in one batched call instead of one model call (and one progress
# bar) per subject; dict order keeps codes and vectors aligned.
subject_embeddings = (
    dict(zip(subject_map.keys(), embedder.embed(list(subject_map.values()))))
    if subject_map
    else {}
)


# Tạo filter index (muốn tìm theo filter phải có cái này)

In [17]:
# The sample payload stores `semester` as an integer (e.g. 0), so the index must use
# the "integer" schema; a "keyword" index only covers string values.
# NOTE(review): assumes every payload stores `semester` as an int - confirm. If a
# "keyword" index already exists on this field it may need to be dropped first.
client.create_payload_index(
    collection_name="flm_fap",
    field_name="semester",
    field_schema="integer"
)


UpdateResult(operation_id=119, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# Index `subject_code` as a keyword field; the search cell filters on this payload key.
client.create_payload_index(
    collection_name="flm_fap",
    field_name="subject_code",
    field_schema="keyword"
)


In [None]:
from numpy import dot
from numpy.linalg import norm
import heapq
def detect_subject(query, top_k=1):
    """Rank the known subjects by cosine similarity to ``query``.

    The query is embedded with the module-level ``embedder`` and compared with
    every vector in ``subject_embeddings``.

    Returns:
        A list of ``(subject_code, similarity)`` tuples, best match first,
        containing at most ``top_k`` entries.
    """
    query_vec = embedder.embed([query])[0]
    query_norm = norm(query_vec)

    # Cosine similarity between the query and each subject label embedding.
    scored = []
    for code, emb in subject_embeddings.items():
        similarity = dot(query_vec, emb) / (query_norm * norm(emb))
        scored.append((code, similarity))

    # Keep the top_k highest-scoring (code, similarity) pairs.
    return heapq.nlargest(top_k, scored, key=lambda pair: pair[1])


In [None]:
from deep_translator import GoogleTranslator

def translate_vi_to_en_google(text):
    """Translate ``text`` from Vietnamese ('vi') to English ('en') with GoogleTranslator."""
    translator = GoogleTranslator(source='vi', target='en')
    translated = translator.translate(text)
    return translated


In [None]:
# Short English description for each chunk type; the descriptions (not the keys) are
# embedded and compared with the query to guess which type the user is asking about.
# NOTE(review): keys presumably mirror the `type` values stored in the payloads, which
# is why the spelling "construtive_question" is left untouched - verify against the data.
TYPE_DESCRIPTIONS = {
    "overview": "general overview of the subject, goals, credits, syllabus",
    "construtive_question": "thought-provoking questions",
    "assessment": "evaluations, types of tests, exams, and grading weights for the subject",
    "session": "lecture sessions, lessons, schedules, topics covered in each week or session",
    "material": "recommended textbooks, reference materials, slides, or other learning resources",
    "learning outcome": "learning outcome, expected knowledge, skills, or competencies students should achieve after completing the course"
}

# One embedding per type, keyed by the type name, in the same order as TYPE_DESCRIPTIONS.
type_embeddings = dict(
    (type_name, embedder.embed([description])[0])
    for type_name, description in TYPE_DESCRIPTIONS.items()
)

In [None]:





# # --- Step 0: Optional Translate Query (n·∫øu c·∫ßn)
# query = "t·ªïng quan m√¥n search engine"

# # N·∫øu c√≥ h√†m d·ªãch: query_en = translate_vi_to_en_nllb(query)
# query_en = query  # n·∫øu b·∫°n kh√¥ng c·∫ßn d·ªãch ho·∫∑c ƒë√£ l√† ti·∫øng Anh

# --- Step 1: Detect type b·∫±ng embedding
def detect_type_by_embedding(query_en):
    """Return the key of ``type_embeddings`` whose vector is most similar to the query.

    The query is embedded with the module-level ``embedder``; similarity is the
    cosine between the query vector and each type-description vector.
    """
    query_vec = embedder.embed([query_en])[0]
    query_norm = norm(query_vec)

    scores = {}
    for type_name, vec in type_embeddings.items():
        scores[type_name] = dot(query_vec, vec) / (query_norm * norm(vec))

    # First key with the highest score wins, as with max() over the dict.
    return max(scores, key=lambda type_name: scores[type_name])

# query_type = detect_type_by_embedding(query_en)


# --- Demo query: translate, detect type and subject, then run a filtered vector search.
query = "learning outcome cho m√¥n deep learning"
# query_en = query  # or use the translation below if needed
query_en = translate_vi_to_en_google(query)
print(query_en)

# Both detectors work on the translated (English) query.
detected_type = detect_type_by_embedding(query_en)
detected_subject = detect_subject(query_en)

# NOTE(review): the search vector is built from the original `query`, not from
# `query_en` used for detection above - confirm this is intended.
query_vec = embedder.embed([query])[0]


# Earlier variant (disabled): require both type and subject via "must".
# query_filter = {"must": []}
# if detected_type:
#     query_filter["must"].append({"key": "type", "match": {"value": detected_type}})
# if detected_subject:
#     query_filter["must"].append({"key": "subject_code", "match": {"value": detected_subject}})

# Current variant: the detected type goes into "must", every detected subject code into "should".
query_filter = {"should": [], "must":[]}
if detected_type:
    print(detected_type)
    query_filter["must"].append({"key": "type", "match": {"value": detected_type}})
if detected_subject:
    # detect_subject returns (code, similarity) tuples, so index 0 is the subject code.
    for subject_code in detected_subject:
        print(subject_code[0])
        query_filter["should"].append({
            "key": "subject_code",
            "match": {"value": subject_code[0]}
        })

# Pass the filter only when at least one condition was added; otherwise search unfiltered.
hits = client.search(
    collection_name="flm_fap",
    query_vector=query_vec.tolist(),
    limit=20,
    query_filter=query_filter if query_filter["must"] or query_filter["should"] else None
)


# --- Step 4: display the results
for hit in hits:
    print(f"\nüîç Score: {hit.score:.4f}")
    print(f"üìò Subject: {hit.payload.get('subject_code')} - {hit.payload.get('type')}")
    print(f"üìÑ Content:\n{hit.payload.get('content')}")
    print("--------")


In [None]:
from collections import Counter

# Page through the whole collection instead of relying on one hard-coded limit, so the
# tally stays complete as the collection grows. scroll() returns the page plus the
# offset of the next page (None once the last page has been read).
# NOTE: as in the original cell, this rebinds the global `points`, which earlier held
# the PointStruct list prepared for upload.
points = []
next_offset = None
while True:
    page, next_offset = client.scroll(
        collection_name="flm_fap",
        limit=1000,
        offset=next_offset,
        with_payload=True
    )
    points.extend(page)
    if next_offset is None:
        break

# Collect the 'type' of every point whose payload has one.
type_list = [point.payload.get("type") for point in points if "type" in point.payload]

# Count how often each type occurs.
type_counts = Counter(type_list)

# Print the result.
print("C√°c type c√≥ trong collection:")
for t, count in type_counts.items():
    print(f"- {t}: {count} b·∫£n ghi")


In [14]:
from tqdm import tqdm
from qdrant_client.models import PointIdsList

# WARNING: destructive cell - it removes points from the collection. On a top-to-bottom
# run it deletes the "overview" points uploaded earlier in this notebook.

# Fetch points with their payloads (a single scroll page of at most 10000 points;
# anything beyond that is not seen).
points = client.scroll(
    collection_name="flm_fap",
    scroll_filter=None,
    limit=10000,
    with_payload=True
)[0]

# Collect the ids of points whose payload type is "overview".
# NOTE(review): the original comment here named 'construtive_question' while the code
# matches "overview" - confirm which type is meant before running.
to_delete = [pt.id for pt in points
             if pt.payload.get("type") == "overview"]

print(f"üîç S·ªë l∆∞·ª£ng ƒëi·ªÉm c·∫ßn x√≥a: {len(to_delete)}")

# Delete only when something matched.
if to_delete:
    client.delete(
        collection_name="flm_fap",
        points_selector=PointIdsList(points=to_delete)
    )
    print("‚úÖ ƒê√£ x√≥a th√†nh c√¥ng.")
else:
    print("‚úÖ Kh√¥ng t√¨m th·∫•y ƒëi·ªÉm n√†o ƒë·ªÉ x√≥a.")


üîç S·ªë l∆∞·ª£ng ƒëi·ªÉm c·∫ßn x√≥a: 65
‚úÖ ƒê√£ x√≥a th√†nh c√¥ng.
