In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()
# Qdrant API key; os.getenv returns None when "qdrant_api_key" is not set.
qdrant_api_key = os.getenv("qdrant_api_key")


# Load the embedding model

In [1]:

import torch
import tqdm


from sentence_transformers import SentenceTransformer
from tqdm import tqdm

class BGEEmbedder:
    """Wrapper around a SentenceTransformer model that adds a retrieval prefix.

    Attributes:
        model: The loaded ``SentenceTransformer`` instance.
        prefix: Instruction string prepended to every text before encoding.
    """

    def __init__(self, model_name="BAAI/bge-m3"):
        """Load the model and set the instruction prefix.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        self.prefix = "Represent this sentence for searching relevant passages: "

    def embed(self, texts, batch_size=16):
        """Encode texts into normalized embeddings.

        Args:
            texts: An iterable of strings, or a single string. A single string
                is treated as a one-element batch.
            batch_size: Batch size forwarded to ``self.model.encode``.

        Returns:
            The value returned by ``self.model.encode`` for the prefixed
            texts (one embedding per input text).
        """
        # A bare string is itself iterable: without this guard it would be
        # split into characters and every character embedded separately.
        if isinstance(texts, str):
            texts = [texts]

        # Prepend the instruction prefix unless the text already carries it.
        # The same prefix is applied to both documents and queries here.
        # NOTE(review): the BGE-M3 model card reportedly says no instruction
        # prefix is needed — confirm this is intentional. Changing it requires
        # re-embedding everything already stored.
        texts_with_prefix = [
            text if text.startswith(self.prefix) else self.prefix + text
            for text in texts
        ]

        # Encode in batches and let the model L2-normalize the vectors.
        embeddings = self.model.encode(
            texts_with_prefix,
            batch_size=batch_size,
            normalize_embeddings=True,
            show_progress_bar=True
        )
        return embeddings




In [None]:
# from sentence_transformers import SentenceTransformer

# # Load model BGE-M3
# model = SentenceTransformer("BAAI/bge-m3")

# def embed_fn(text: str):
#     # BGE-M3 khuyến nghị prefix truy vấn bằng "Represent this sentence for searching relevant passages:"
#     if not text.startswith("Represent"):
#         text = "Represent this sentence for searching relevant passages: " + text
#     return model.encode(text, normalize_embeddings=True)


# Embedding phần content của payload

In [2]:
import json

def load_payloads_json(path):
    """Read a JSON file and return the decoded object.

    The file is opened with the ``utf-8-sig`` codec, so a leading UTF-8
    byte-order mark is dropped before parsing.

    Args:
        path: Location of the JSON file.

    Returns:
        The Python object decoded from the file's JSON content.
    """
    with open(path, "r", encoding="utf-8-sig") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


In [3]:
# Load the overview-syllabus payloads from disk.
# NOTE(review): absolute, machine-specific Windows path — the notebook only
# runs on a machine that has this exact file layout.
payloads = load_payloads_json(r"D:\Learn\Kì 5\SEG301\Fap-Chat\data\Chunk_JSON\overview_syllabus_payloads.json")  
# Display the first payload as a quick sanity check.
payloads[0]

{'subject_code': 'OTP101',
 'subject_name': 'Orientation and General Training Program_Định hướng và Rèn luyện tập trung',
 'degree_level': 'Bachelor',
 'semester': 0,
 'credits': 0,
 'belong_to_combo': 'nan',
 'type': 'overview',
 'content': "TYPE: overview\nSubject Code: OTP101\nSubject Name: Orientation and General Training Program_Định hướng và Rèn luyện tập trung\nDegree Level: Bachelor | Credits: 0 | Semester: 0\nBelong To Combo: None\nPre-requisites: None\nScoring Scale: 10.0 | Min Avg Mark to Pass: 0.0\nApproved: True on 8/5/2022\nSubject Link: https://flm.fpt.edu.vn/gui/role/student/Syllabuses.aspx?subCode=OTP101&curriculumID=2347\n\n--- TIME ALLOCATION ---\n5 weeks (fulltime) = 280 h* Module 1: Orientation-Định hướng(1 week: 8 h/day * 5 days = 40 h)* Module 2: Military Training-Giáo dục quốc phòng(110 slots * 1.5 h/slot = 165 h)* Module 3: Experience Program 22 slots * 1.5 h = 33 h* Module 4: Vovinam 28 slots * 1,5 h/slot = 42 h\n--- TIME ALLOCATION END ---\n\n--- DESCRIPTION 

In [4]:
# or use load_payloads_json

# Collect the "content" field of every payload; a payload without it raises KeyError.
contents = [p["content"] for p in payloads]
embedder = BGEEmbedder()               # create the embedder instance
vectors = embedder.embed(contents)     # call the method on the instance



# client.upsert(collection_name="flm_chunks", points=points)




Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
from collections import Counter

# Count payloads per "type" value; payloads without a "type" key are skipped.
types = [p["type"] for p in payloads if "type" in p]
print(Counter(types))


Counter({'overview': 73})


In [15]:
from qdrant_client.models import PointStruct
import uuid

# Build one Qdrant point per (vector, payload) pair.
import json

# zip() silently drops the tail when the two sequences differ in length
# (e.g. `payloads` reloaded without re-running the embedding cell), so fail
# loudly instead of uploading misaligned data.
if len(vectors) != len(payloads):
    raise ValueError(
        f"vectors/payloads length mismatch: {len(vectors)} != {len(payloads)}"
    )

points = [
    PointStruct(
        # Deterministic UUIDv5 derived from the payload itself: re-running
        # this cell and the upload overwrites the same points instead of
        # inserting duplicates under fresh random IDs. Payloads that are
        # identical in every field map to the same point.
        id=str(uuid.uuid5(
            uuid.NAMESPACE_URL,
            json.dumps(payload, sort_keys=True, ensure_ascii=False),
        )),
        vector=vec.tolist(),
        payload=payload
    )
    for vec, payload in zip(vectors, payloads)
]


In [None]:
# Number of items currently held in `points`.
len(points)

In [None]:
# NOTE(review): `points_session` is not defined in any cell visible in this
# notebook; on a fresh kernel this cell raises NameError. Presumably left over
# from an earlier session — confirm and remove or define it.
points_session

# Gọi Qdrant vector DB

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from qdrant_client.models import Filter
# client = QdrantClient(
#     url=r"https://0f47d391-b7c1-45d9-a956-5f7228cd80f3.europe-west3-0.gcp.cloud.qdrant.io:6333",
#     api_key=qdrant_api_key,
#     prefer_grpc=False
# )
# Connect to the Qdrant instance at localhost:6333.
client = QdrantClient(host="localhost", port=6333)

# client.recreate_collection(
#     collection_name="flm_fap",
#     vectors_config=VectorParams(size=1024, distance=Distance.COSINE)
# )
# client.delete(
#     collection_name="flm_fap",
#     points_selector=Filter(must=[])  # Xoá toàn bộ points
# )


# Up data lên vector DB

In [16]:
import time

def safe_upsert(client, collection_name, points, retries=5, wait=5):
    """Upsert ``points`` into ``collection_name``, retrying on failure.

    Args:
        client: Object exposing ``upsert(collection_name=..., points=...)``.
        collection_name: Name of the target collection.
        points: Points sent in a single ``upsert`` call.
        retries: Maximum number of attempts.
        wait: Seconds to sleep between two consecutive attempts.

    Returns:
        True as soon as one attempt succeeds. False when every attempt raised,
        or when ``retries`` is not positive (nothing is attempted).
    """
    for attempt in range(retries):
        try:
            client.upsert(collection_name=collection_name, points=points)
            return True
        except Exception as e:
            # Best-effort upload: report the error and retry instead of
            # aborting the whole notebook run.
            print(f"❌ Retry {attempt+1}/{retries} - Lỗi: {e}")
            # Only pause when another attempt follows; sleeping after the
            # final failure just delays the caller.
            if attempt + 1 < retries:
                time.sleep(wait)
    return False
# Upload `points` to the "flm_fap" collection in slices of 100. A slice that
# still fails after all retries is reported by its start offset and the loop
# moves on to the next slice.
for i in range(0, len(points), 100):
    success = safe_upsert(client, "flm_fap", points[i:i+100])
    if not success:
        print("⚠️ Không thể upload batch", i)


In [None]:
# Ask Qdrant for the exact number of points stored in "flm_fap".
count = client.count("flm_fap", exact=True).count
print(f"✅ Số vector đã up: {count}")


In [None]:
# Creates a new embedder (BGEEmbedder.__init__ loads the model again) and
# rebinds `embedder`; lets the search demo below run without the embedding
# cell above having been executed.
embedder = BGEEmbedder()  

# Demo search

In [None]:
import pandas as pd
# NOTE(review): absolute, machine-specific Windows path — the notebook only
# runs on a machine that has this exact file layout.
df_flm=pd.read_csv(r'D:\Learn\Kì 5\SEG301\DEMO_local\FLM\FINAL_DF_FLM.csv')

# Map each subject code to a "CODE - Name" label. Rows missing either column
# are dropped and duplicate (code, name) pairs collapsed; if one code appears
# with several names, the last one wins.
# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on Python versions before 3.12.
_subject_pairs = df_flm[["SubjectCode", "Subject Name"]].dropna().drop_duplicates()
subject_map = {
    code: f"{code} - {name}"
    for code, name in zip(_subject_pairs["SubjectCode"], _subject_pairs["Subject Name"])
}

# Embed all labels in one batched call instead of one model call (and one
# progress bar) per subject; row i of the result belongs to the i-th label.
_subject_labels = list(subject_map.values())
subject_embeddings = (
    dict(zip(subject_map.keys(), embedder.embed(_subject_labels)))
    if _subject_labels
    else {}
)


# Tạo filter index (muốn tìm theo filter phải có cái này)

In [17]:
# Payload index on `semester` for filtered search.
# The sample payload shown earlier in this notebook stores `semester` as an
# integer ('semester': 0), so the index type is "integer"; a "keyword" index
# is meant for string values and would not cover integer payload values.
# NOTE(review): if a keyword index on this field already exists from an
# earlier run, confirm whether it has to be dropped first
# (client.delete_payload_index) before this call takes effect.
client.create_payload_index(
    collection_name="flm_fap",
    field_name="semester",
    field_schema="integer"
)


UpdateResult(operation_id=119, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# Keyword payload index on `subject_code`, the field the search demo filters on.
client.create_payload_index(
    collection_name="flm_fap",
    field_name="subject_code",
    field_schema="keyword"
)


In [None]:
from numpy import dot
from numpy.linalg import norm
import heapq
def detect_subject(query, top_k=1):
    """Rank the known subjects by cosine similarity to ``query``.

    The query is embedded with the module-level ``embedder`` and compared
    with every vector in the module-level ``subject_embeddings`` mapping.

    Args:
        query: Text to match against the subject labels.
        top_k: Number of best-scoring subjects to return.

    Returns:
        A list of ``(subject_code, similarity)`` tuples, best score first,
        holding at most ``top_k`` entries.
    """
    q = embedder.embed([query])[0]
    q_norm = norm(q)

    # Cosine similarity between the query and each subject vector.
    scored = [
        (code, dot(q, emb) / (q_norm * norm(emb)))
        for code, emb in subject_embeddings.items()
    ]

    # Keep only the top_k highest-scoring (code, score) pairs.
    return heapq.nlargest(top_k, scored, key=lambda pair: pair[1])


In [None]:
from deep_translator import GoogleTranslator

def translate_vi_to_en_google(text):
    """Translate ``text`` from Vietnamese to English.

    Args:
        text: Text to translate.

    Returns:
        Whatever ``GoogleTranslator(source='vi', target='en').translate``
        returns for ``text``.
    """
    translator = GoogleTranslator(source='vi', target='en')
    return translator.translate(text)


In [None]:
# Short English description for each chunk type; each description is embedded
# below and compared with the query embedding to guess which type a query
# is asking about.
# NOTE(review): the keys are used verbatim as the `type` filter value in the
# search demo, so they presumably mirror the stored payloads (including the
# spelling "construtive_question" and the space in "learning outcome") —
# verify against the data before "fixing" them.
TYPE_DESCRIPTIONS = {
    "overview": "general overview of the subject, goals, credits, syllabus",
    "construtive_question": "thought-provoking questions",
    "assessment": "evaluations, types of tests, exams, and grading weights for the subject",
    "session": "lecture sessions, lessons, schedules, topics covered in each week or session",
    "material": "recommended textbooks, reference materials, slides, or other learning resources",
    "learning outcome": "learning outcome, expected knowledge, skills, or competencies students should achieve after completing the course"
}



# One embedding per type description (one embed call per entry).
type_embeddings = {
    t: embedder.embed([desc])[0] for t, desc in TYPE_DESCRIPTIONS.items()
}

In [None]:





# # --- Step 0: Optional Translate Query (nếu cần)
# query = "tổng quan môn search engine"

# # Nếu có hàm dịch: query_en = translate_vi_to_en_nllb(query)
# query_en = query  # nếu bạn không cần dịch hoặc đã là tiếng Anh

# --- Step 1: detect the chunk type from the query embedding
def detect_type_by_embedding(query_en):
    """Return the chunk type whose description is closest to the query.

    The query is embedded with the module-level ``embedder`` and compared by
    cosine similarity with every vector in the module-level
    ``type_embeddings`` mapping.

    Args:
        query_en: Query text (the caller passes the English translation).

    Returns:
        The key of ``type_embeddings`` with the highest similarity; on a tie
        the key that comes first in the mapping wins.
    """
    q = embedder.embed([query_en])[0]
    q_norm = norm(q)
    scored = [
        (type_name, dot(q, vec) / (q_norm * norm(vec)))
        for type_name, vec in type_embeddings.items()
    ]
    winner, _ = max(scored, key=lambda pair: pair[1])
    return winner

# query_type = detect_type_by_embedding(query_en)


# Demo: translate a Vietnamese query, guess its chunk type and subject, then
# run a filtered vector search against the "flm_fap" collection.
query = "learning outcome cho môn deep learning"
# query_en = query  # or use the translation when needed
query_en = translate_vi_to_en_google(query)
print(query_en)

# Both detectors work on the translated (English) query.
detected_type = detect_type_by_embedding(query_en)
detected_subject = detect_subject(query_en)

# NOTE(review): the search vector is built from the original `query`, not from
# `query_en` used for detection above — confirm this is intentional.
query_vec = embedder.embed([query])[0]


# Earlier variant: both type and subject as hard `must` conditions.
# query_filter = {"must": []}
# if detected_type:
#     query_filter["must"].append({"key": "type", "match": {"value": detected_type}})
# if detected_subject:
#     query_filter["must"].append({"key": "subject_code", "match": {"value": detected_subject}})

# Current variant: the detected type goes into `must`, every detected subject
# code into `should`.
query_filter = {"should": [], "must":[]}
if detected_type:
    print(detected_type)
    query_filter["must"].append({"key": "type", "match": {"value": detected_type}})
if detected_subject:
    # detect_subject returns (subject_code, similarity) tuples; [0] is the code.
    for subject_code in detected_subject:
        print(subject_code[0])
        query_filter["should"].append({
            "key": "subject_code",
            "match": {"value": subject_code[0]}
        })

# Pass no filter at all when neither condition list was populated.
hits = client.search(
    collection_name="flm_fap",
    query_vector=query_vec.tolist(),
    limit=20,
    query_filter=query_filter if query_filter["must"] or query_filter["should"] else None
)


# --- Step 4: display the results
for hit in hits:
    print(f"\n🔍 Score: {hit.score:.4f}")
    print(f"📘 Subject: {hit.payload.get('subject_code')} - {hit.payload.get('type')}")
    print(f"📄 Content:\n{hit.payload.get('content')}")
    print("--------")


In [None]:
from collections import Counter

# Scroll up to 2621 points (raise the limit if the collection holds more).
# NOTE(review): this rebinds `points`, which the upload cells use for the list
# of PointStruct objects — re-run the point-building cell before uploading again.
points, _ = client.scroll(
    collection_name="flm_fap",
    limit=2621,  # or more if needed
    with_payload=True
)

# Collect the 'type' of every point whose payload has one.
type_list = [point.payload.get("type") for point in points if "type" in point.payload]

# Count how many times each type occurs.
type_counts = Counter(type_list)

# Print the result.
# NOTE(review): the loop variable `count` rebinds the `count` set by the
# vector-count cell.
print("Các type có trong collection:")
for t, count in type_counts.items():
    print(f"- {t}: {count} bản ghi")


In [14]:
from tqdm import tqdm
from qdrant_client.models import PointIdsList

# Fetch points from the collection in a single scroll page (at most 10000);
# anything beyond that limit is not seen by this cell.
# NOTE(review): this rebinds `points`, which the upload cells use for the list
# of PointStruct objects.
points = client.scroll(
    collection_name="flm_fap",
    scroll_filter=None,
    limit=10000,
    with_payload=True
)[0]

# Select the IDs of points whose type is 'overview'.
# (The previous comment named 'construtive_question', but the code filters on
# "overview".)
to_delete = [pt.id for pt in points
             if pt.payload.get("type") == "overview"]

print(f"🔍 Số lượng điểm cần xóa: {len(to_delete)}")

# Delete only when something matched. Destructive: removes the selected
# points from "flm_fap".
if to_delete:
    client.delete(
        collection_name="flm_fap",
        points_selector=PointIdsList(points=to_delete)
    )
    print("✅ Đã xóa thành công.")
else:
    print("✅ Không tìm thấy điểm nào để xóa.")


🔍 Số lượng điểm cần xóa: 65
✅ Đã xóa thành công.
