In [1]:
import duckdb
import pandas

db_file = "../ingest_data/tin_chi.duckdb"

# Tables to preview, in the order they are printed.
TABLES_TO_PREVIEW = ("GiangVien", "MonHoc", "LopHocPhan")

try:
    # Only SELECTs are issued, so open the file read-only: no write lock is
    # taken and a mistyped path fails instead of creating an empty database.
    with duckdb.connect(database=db_file, read_only=True) as conn:
        table_frames = {}
        for table_name in TABLES_TO_PREVIEW:
            print(f"\n--- B·∫£ng: {table_name} ---")
            table_frames[table_name] = conn.execute(f"SELECT * FROM {table_name}").df()
            print(table_frames[table_name])

    # Keep the original per-table variable names available to later cells.
    giangvien_df = table_frames["GiangVien"]
    monhoc_df = table_frames["MonHoc"]
    lophocphan_df = table_frames["LopHocPhan"]

except duckdb.Error as e:
    # Report database problems (missing file, missing table, ...) without a
    # traceback; unrelated programming errors are left to propagate.
    print(f"ƒê√£ c√≥ l·ªói x·∫£y ra: {e}")


--- B·∫£ng: GiangVien ---
    MaGV           TenGV                 Khoa
0  GV001   Nguy·ªÖn VƒÉn An  C√¥ng ngh·ªá th√¥ng tin
1  GV002   Tr·∫ßn Th·ªã B√≠ch  C√¥ng ngh·ªá th√¥ng tin
2  GV003     L√™ Minh H·∫£i              Kinh t·∫ø
3  GV004  Ph·∫°m Th·ªã Duy√™n  C√¥ng ngh·ªá th√¥ng tin
4  GV005  Ho√†ng VƒÉn Tu·∫•n            Ngo·∫°i ng·ªØ

--- B·∫£ng: MonHoc ---
    MaMH                      TenMH  SoTinChi
0  MH001           L·∫≠p tr√¨nh Python         3
1  MH002              C∆° s·ªü d·ªØ li·ªáu         3
2  MH003              Kinh t·∫ø vƒ© m√¥         2
3  MH004        Ti·∫øng Anh giao ti·∫øp         4
4  MH005  Nh·∫≠p m√¥n Tr√≠ tu·ªá nh√¢n t·∫°o         3

--- B·∫£ng: LopHocPhan ---
  MaLopHP   MaMH   MaGV BuoiHoc          HocKy
0  LHP001  MH001  GV001    S√°ng  HK1_2025-2026
1  LHP002  MH001  GV002   Chi·ªÅu  HK1_2025-2026
2  LHP003  MH002  GV001   Chi·ªÅu  HK1_2025-2026
3  LHP004  MH003  GV003    S√°ng  HK2_2025-2026
4  LHP005  MH004  GV005     T·ªëi  HK1_2025-2026
5  LHP006  MH0

In [2]:
import re
import requests
from vanna.base import VannaBase


class CustomGroq(VannaBase):
    """Custom Vanna LLM backend that sends chat prompts to the Groq Cloud API.

    Recognised ``config`` keys:
        api_key (required): bearer token sent in the ``Authorization`` header.
        model: model identifier; defaults to ``"llama3-70b-8192"``.
        temperature: sampling temperature; defaults to ``0.7``.
        timeout: seconds to wait for the HTTP request; defaults to ``60``.
    """

    # Matches from the first standalone ``select`` keyword up to the first
    # ``;``, the first code fence, or the end of the text. The word boundaries
    # keep words such as "selected" from being mistaken for the start of a query.
    _SELECT_PATTERN = re.compile(
        r"\bselect\b.*?(?:;|```|$)", re.IGNORECASE | re.DOTALL
    )

    def __init__(self, config=None):
        """Store the connection settings taken from ``config``.

        Raises:
            ValueError: if ``config`` is ``None`` or has no ``"api_key"`` entry.
        """
        if config is None or "api_key" not in config:
            raise ValueError("Config ph·∫£i ch·ª©a 'api_key' cho Groq.")

        self.api_key = config["api_key"]
        self.model = config.get("model", "llama3-70b-8192")  # default model
        self.temperature = config.get("temperature", 0.7)
        # Without a timeout a stalled connection would block the caller forever.
        self.timeout = config.get("timeout", 60)
        self.base_url = "https://api.groq.com/openai/v1"

    def system_message(self, message: str) -> dict:
        """Wrap ``message`` as a chat message with the ``system`` role."""
        return {"role": "system", "content": message}

    def user_message(self, message: str) -> dict:
        """Wrap ``message`` as a chat message with the ``user`` role."""
        return {"role": "user", "content": message}

    def assistant_message(self, message: str) -> dict:
        """Wrap ``message`` as a chat message with the ``assistant`` role."""
        return {"role": "assistant", "content": message}

    def submit_prompt(self, prompt, **kwargs) -> str:
        """POST ``prompt`` to the chat-completions endpoint and return the reply text.

        Args:
            prompt: list of chat message dicts, sent as the ``messages`` field.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            The ``content`` of the first choice in the response. If the request
            fails or the payload does not have the expected shape, the error is
            printed and a placeholder SQL statement is returned instead of
            raising, so the calling code keeps running.
        """
        url = f"{self.base_url}/chat/completions"

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        data = {
            "model": self.model,
            "messages": prompt,
            "temperature": self.temperature,
            "stream": False,
        }

        # Must be bound before the try block: if requests.post itself raises
        # (DNS failure, refused connection, timeout) there is no response
        # object, and the handler below would otherwise hit UnboundLocalError.
        response = None
        try:
            response = requests.post(
                url, headers=headers, json=data, timeout=self.timeout
            )
            # Turn HTTP error statuses (e.g. 401, 404, 500) into exceptions.
            response.raise_for_status()

            response_dict = response.json()
            return response_dict["choices"][0]["message"]["content"]

        except (
            requests.exceptions.RequestException,
            ValueError,  # body is not valid JSON
            KeyError,  # expected key missing from the payload
            IndexError,  # empty "choices" list
            TypeError,  # payload is not the expected dict/list structure
        ) as e:
            print(f"L·ªói khi g·ªçi API Groq: {e}")
            if response is not None:
                print(f"Response body: {response.text}")
            # Return a harmless SQL statement so the program does not crash.
            return "SELECT 'API_CALL_ERROR' as error;"

    def extract_sql_query(self, text: str) -> str:
        """Return the first SELECT statement found in ``text``.

        The match runs from the first standalone ``select`` keyword up to the
        first ``;`` (kept), the first code fence (removed), or the end of the
        text. If no ``select`` keyword is present, ``text`` is returned as is.
        """
        match = self._SELECT_PATTERN.search(text)
        if match:
            return match.group(0).replace("```", "").strip()
        return text

    def generate_sql(self, question: str, **kwargs) -> str:
        """Generate SQL for ``question`` and clean up the model's output.

        Delegates to the parent class's ``generate_sql``, then removes
        backslashes from the result and extracts the SELECT statement.
        """
        sql_from_llm = super().generate_sql(question, **kwargs)

        # Un-escape underscores first, then drop any remaining backslashes.
        sql = sql_from_llm.replace("\\_", "_").replace("\\", "")
        return self.extract_sql_query(sql)

    def generate_answer(self, question: str, **kwargs) -> str:
        """Answer ``question`` in natural language using data from the database.

        Generates SQL for the question, runs it with ``self.run_sql`` and asks
        the model to phrase an answer from the resulting rows.

        Returns:
            The model's answer, or a fixed apology message when the query
            returned ``None`` or an empty result.
        """
        sql = self.generate_sql(question=question, **kwargs)

        df = self.run_sql(sql=sql)

        if df is None or df.empty:
            return "Xin l·ªói, t√¥i kh√¥ng t√¨m th·∫•y d·ªØ li·ªáu n√†o ph√π h·ª£p v·ªõi c√¢u h·ªèi c·ªßa b·∫°n."

        # A single system message carries the instructions, the question and
        # the full query result rendered as text.
        prompt = [
            self.system_message(
                "B·∫°n l√† m·ªôt tr·ª£ l√Ω AI. Ng∆∞·ªùi d√πng ƒë√£ h·ªèi m·ªôt c√¢u h·ªèi v√† b·∫°n ƒë√£ c√≥ d·ªØ li·ªáu tr·∫£ v·ªÅ t·ª´ database. "
                "H√£y tr·∫£ l·ªùi c√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng m·ªôt c√°ch t·ª± nhi√™n d·ª±a tr√™n d·ªØ li·ªáu ƒë∆∞·ª£c cung c·∫•p.\n"
                f"C√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng l√†: {question}\n\n"
                f"D·ªØ li·ªáu t·ª´ database l√†:\n{df.to_string()}"
            )
        ]

        return self.submit_prompt(prompt)


In [None]:
import os

import vanna as vn
from vanna.chromadb import ChromaDB_VectorStore
from vanna.openai import OpenAI_Chat
# --- STEP 1: SETUP AND CONNECTION ---

# 1.1. Define the custom Vanna class
# It tells Vanna to use Groq (through CustomGroq) for language processing and ChromaDB to store the training data
class MyVanna(ChromaDB_VectorStore, CustomGroq):
    """Vanna class combining ``ChromaDB_VectorStore`` with the ``CustomGroq`` LLM backend.

    Both parent initialisers are called explicitly, ``ChromaDB_VectorStore``
    first, and each receives the same ``config`` object.
    """

    def __init__(self, config=None):
        for parent in (ChromaDB_VectorStore, CustomGroq):
            parent.__init__(self, config=config)

# Never commit an API key to the notebook: read it from the environment.
# Falls back to an empty string (the previous hard-coded value) when unset,
# in which case the Groq API calls will be rejected.
config = {
    "api_key": os.environ.get("GROQ_API_KEY", ""),
    "model": "llama-3.3-70b-versatile"
}

# NOTE: this rebinds `vn`, shadowing the `vanna` module alias imported above.
vn = MyVanna(config=config)

db_path = '../ingest_data/tin_chi.duckdb'
vn.connect_to_duckdb(url=db_path)

print(f"ƒê√£ k·∫øt n·ªëi th√†nh c√¥ng t·ªõi database: {db_path}")

# 1. Train on the schema (DDL), one statement per table.
DDL_STATEMENTS = [
    """
    CREATE TABLE GiangVien (
        MaGV VARCHAR(10) PRIMARY KEY, TenGV VARCHAR(100) NOT NULL, Khoa VARCHAR(100)
    );
""",
    """
    CREATE TABLE MonHoc (
        MaMH VARCHAR(10) PRIMARY KEY, TenMH VARCHAR(150) NOT NULL, SoTinChi INT
    );
""",
    """
    CREATE TABLE LopHocPhan (
        MaLopHP VARCHAR(15) PRIMARY KEY, MaMH VARCHAR(10) NOT NULL, MaGV VARCHAR(10) NOT NULL, 
        BuoiHoc VARCHAR(10), HocKy VARCHAR(20)
    );
""",
]
for ddl_statement in DDL_STATEMENTS:
    vn.train(ddl=ddl_statement)

# 2. Train on terminology and rules (documentation).
DOCUMENTATION_NOTES = [
    "Bu·ªïi s√°ng l√† c√°c l·ªõp h·ªçc di·ªÖn ra v√†o bu·ªïi s√°ng.",
    "Bu·ªïi chi·ªÅu l√† c√°c l·ªõp h·ªçc di·ªÖn ra v√†o bu·ªïi chi·ªÅu.",
    "Bu·ªïi t·ªëi l√† c√°c l·ªõp h·ªçc di·ªÖn ra v√†o bu·ªïi t·ªëi.",
    "H·ªçc k·ª≥ c√≥ ƒë·ªãnh d·∫°ng 'HK{s·ªë}_{nƒÉm h·ªçc}', v√≠ d·ª•: 'HK1_2025-2026'.",
    "Khi ng∆∞·ªùi d√πng h·ªèi t√™n gi·∫£ng vi√™n kh√¥ng ƒë·∫ßy ƒë·ªß, h√£y t√¨m ki·∫øm g·∫ßn ƒë√∫ng. V√≠ d·ª• 'An' c√≥ th·ªÉ l√† 'Nguy·ªÖn VƒÉn An'.",
]
for documentation_note in DOCUMENTATION_NOTES:
    vn.train(documentation=documentation_note)

# 3. Train on query scenarios (question-SQL pairs).
# Each entry is (progress line to print, [(question, sql), ...]).
QUESTION_SQL_SCENARIOS = [
    (
        "  - K·ªãch b·∫£n 1: L·ªçc ƒë∆°n gi·∫£n",
        [
            (
                "G·ª£i √Ω c√°c l·ªõp h·ªçc v√†o bu·ªïi s√°ng",
                "SELECT gv.TenGV, mh.TenMH, lhp.BuoiHoc FROM LopHocPhan AS lhp JOIN GiangVien AS gv ON lhp.MaGV = gv.MaGV JOIN MonHoc AS mh ON lhp.MaMH = mh.MaMH WHERE lhp.BuoiHoc = 'S√°ng'",
            ),
            (
                "Ai d·∫°y m√¥n L·∫≠p tr√¨nh Python?",
                "SELECT gv.TenGV FROM LopHocPhan AS lhp JOIN GiangVien AS gv ON lhp.MaGV = gv.MaGV JOIN MonHoc AS mh ON lhp.MaMH = mh.MaMH WHERE mh.TenMH = 'L·∫≠p tr√¨nh Python'",
            ),
        ],
    ),
    (
        "  - K·ªãch b·∫£n 2: L·ªçc k·∫øt h·ª£p nhi·ªÅu ƒëi·ªÅu ki·ªán",
        [
            (
                "T√¥i mu·ªën h·ªçc m√¥n L·∫≠p tr√¨nh Python c·ªßa c√¥ B√≠ch",
                "SELECT gv.TenGV, mh.TenMH, lhp.BuoiHoc FROM LopHocPhan AS lhp JOIN GiangVien AS gv ON lhp.MaGV = gv.MaGV JOIN MonHoc AS mh ON lhp.MaMH = mh.MaMH WHERE gv.TenGV = 'Tr·∫ßn Th·ªã B√≠ch' AND mh.TenMH = 'L·∫≠p tr√¨nh Python'",
            ),
            (
                "C√≥ l·ªõp n√†o c·ªßa khoa C√¥ng ngh·ªá th√¥ng tin v√†o bu·ªïi chi·ªÅu kh√¥ng?",
                "SELECT mh.TenMH, gv.TenGV FROM LopHocPhan lhp JOIN MonHoc mh ON lhp.MaMH = mh.MaMH JOIN GiangVien gv ON lhp.MaGV = gv.MaGV WHERE gv.Khoa = 'C√¥ng ngh·ªá th√¥ng tin' AND lhp.BuoiHoc = 'Chi·ªÅu'",
            ),
        ],
    ),
    (
        "  - K·ªãch b·∫£n 3: ƒê·∫øm v√† t·ªïng h·ª£p (Aggregation)",
        [
            (
                "Khoa C√¥ng ngh·ªá th√¥ng tin c√≥ bao nhi√™u gi·∫£ng vi√™n?",
                "SELECT COUNT(MaGV) FROM GiangVien WHERE Khoa = 'C√¥ng ngh·ªá th√¥ng tin'",
            ),
            (
                "Th·∫ßy Nguy·ªÖn VƒÉn An d·∫°y bao nhi√™u m√¥n?",
                "SELECT COUNT(DISTINCT lhp.MaMH) FROM LopHocPhan AS lhp JOIN GiangVien AS gv ON lhp.MaGV = gv.MaGV WHERE gv.TenGV = 'Nguy·ªÖn VƒÉn An'",
            ),
        ],
    ),
    (
        "  - K·ªãch b·∫£n 4: X·ª≠ l√Ω t√™n kh√¥ng ƒë·∫ßy ƒë·ªß (Fuzzy Matching)",
        [
            (
                "G·ª£i √Ω c√°c m√¥n th·∫ßy An d·∫°y",
                "SELECT DISTINCT mh.TenMH FROM LopHocPhan AS lhp JOIN GiangVien AS gv ON lhp.MaGV = gv.MaGV JOIN MonHoc AS mh ON lhp.MaMH = mh.MaMH WHERE gv.TenGV LIKE '%An%'",
            ),
            (
                "T√¥i th√≠ch h·ªçc bu·ªïi chi·ªÅu, c√≥ l·ªõp n√†o c·ªßa c√¥ Duy√™n kh√¥ng?",
                "SELECT mh.TenMH, gv.TenGV FROM LopHocPhan AS lhp JOIN GiangVien AS gv ON lhp.MaGV = gv.MaGV JOIN MonHoc AS mh ON lhp.MaMH = mh.MaMH WHERE gv.TenGV LIKE '%Duy√™n%' AND lhp.BuoiHoc = 'Chi·ªÅu'",
            ),
        ],
    ),
    (
        "  - K·ªãch b·∫£n 5: Truy v·∫•n theo h·ªçc k·ª≥",
        [
            (
                "C√°c m√¥n trong h·ªçc k·ª≥ 2 nƒÉm h·ªçc 2025-2026",
                "SELECT mh.TenMH, gv.TenGV FROM LopHocPhan lhp JOIN MonHoc mh ON lhp.MaMH = mh.MaMH JOIN GiangVien gv ON lhp.MaGV = gv.MaGV WHERE lhp.HocKy = 'HK2_2025-2026'",
            ),
        ],
    ),
]
for scenario_header, question_sql_pairs in QUESTION_SQL_SCENARIOS:
    print(scenario_header)
    for training_question, training_sql in question_sql_pairs:
        vn.train(question=training_question, sql=training_sql)

print("‚úÖ Hu·∫•n luy·ªán to√†n di·ªán ho√†n t·∫•t!")




True
ƒê√£ k·∫øt n·ªëi th√†nh c√¥ng t·ªõi database: ../ingest_data/tin_chi.duckdb
Adding ddl: 
    CREATE TABLE GiangVien (
        MaGV VARCHAR(10) PRIMARY KEY, TenGV VARCHAR(100) NOT NULL, Khoa VARCHAR(100)
    );

Adding ddl: 
    CREATE TABLE MonHoc (
        MaMH VARCHAR(10) PRIMARY KEY, TenMH VARCHAR(150) NOT NULL, SoTinChi INT
    );

Adding ddl: 
    CREATE TABLE LopHocPhan (
        MaLopHP VARCHAR(15) PRIMARY KEY, MaMH VARCHAR(10) NOT NULL, MaGV VARCHAR(10) NOT NULL, 
        BuoiHoc VARCHAR(10), HocKy VARCHAR(20)
    );

Adding documentation....
Adding documentation....
Adding documentation....
Adding documentation....
Adding documentation....
  - K·ªãch b·∫£n 1: L·ªçc ƒë∆°n gi·∫£n
  - K·ªãch b·∫£n 2: L·ªçc k·∫øt h·ª£p nhi·ªÅu ƒëi·ªÅu ki·ªán
  - K·ªãch b·∫£n 3: ƒê·∫øm v√† t·ªïng h·ª£p (Aggregation)
  - K·ªãch b·∫£n 4: X·ª≠ l√Ω t√™n kh√¥ng ƒë·∫ßy ƒë·ªß (Fuzzy Matching)
  - K·ªãch b·∫£n 5: Truy v·∫•n theo h·ªçc k·ª≥
‚úÖ Hu·∫•n luy·ªán to√†n di·ªán ho√†n t·∫•t!


In [4]:
# --- Ask Vanna the sample questions ---
print("\nB√¢y gi·ªù b·∫°n c√≥ th·ªÉ b·∫Øt ƒë·∫ßu h·ªèi Vanna:")

# Questions used to exercise the trained model.
test_questions = [
    "Ai d·∫°y m√¥n Kinh t·∫ø vƒ© m√¥?",
    "Th·∫ßy An c√≥ d·∫°y l·ªõp n√†o v√†o bu·ªïi chi·ªÅu kh√¥ng?",
    "C√¥ B√≠ch d·∫°y t·ªïng c·ªông bao nhi√™u m√¥n?",
    "T√¨m c√°c l·ªõp c·ªßa c√¥ Duy√™n.",
    "Ngo√†i th·∫ßy An, c√≤n ai d·∫°y m√¥n C∆° s·ªü d·ªØ li·ªáu kh√¥ng?"
]


def _ask_and_print(question):
    """Print the question, fetch Vanna's answer for it, then print the answer."""
    print(f"‚ùì User: {question}")
    answer = vn.generate_answer(question=question)
    print(f"ü§ñ Vanna: {answer}\n")


# Go through the questions in order, one exchange per question.
for question in test_questions:
    _ask_and_print(question)


B√¢y gi·ªù b·∫°n c√≥ th·ªÉ b·∫Øt ƒë·∫ßu h·ªèi Vanna:
‚ùì User: Ai d·∫°y m√¥n Kinh t·∫ø vƒ© m√¥?
SQL Prompt: [{'role': 'system', 'content': "You are a DuckDB SQL expert. Please help to generate a SQL query to answer the question. Your response should ONLY be based on the given context and follow the response guidelines and format instructions. \n===Tables \n\n    CREATE TABLE GiangVien (\n        MaGV VARCHAR(10) PRIMARY KEY, TenGV VARCHAR(100) NOT NULL, Khoa VARCHAR(100)\n    );\n\n\n\n    CREATE TABLE MonHoc (\n        MaMH VARCHAR(10) PRIMARY KEY, TenMH VARCHAR(150) NOT NULL, SoTinChi INT\n    );\n\n\n\n    CREATE TABLE LopHocPhan (\n        MaLopHP VARCHAR(15) PRIMARY KEY, MaMH VARCHAR(10) NOT NULL, MaGV VARCHAR(10) NOT NULL, \n        BuoiHoc VARCHAR(10), HocKy VARCHAR(20)\n    );\n\n\n\n===Additional Context \n\nKhi ng∆∞·ªùi d√πng h·ªèi t√™n gi·∫£ng vi√™n kh√¥ng ƒë·∫ßy ƒë·ªß, h√£y t√¨m ki·∫øm g·∫ßn ƒë√∫ng. V√≠ d·ª• 'An' c√≥ th·ªÉ l√† 'Nguy·ªÖn VƒÉn An'.\n\nBu·ªïi t·ªëi l√† c√°c l