In [None]:
import faiss

from Config import Configs
from Config import ModelLoader as ML

from Libraries import Common_MyUtils as MU

from Libraries import Faiss_Embedding as F_Embedding, Faiss_Searching as F_Searching, Faiss_ChunkMapping as ChunkMapper
from sentence_transformers import CrossEncoder

## CONFIGURATIONS

#### SERVICES

In [None]:
# Run-level settings for this notebook.
Checkpoint = "vinai/bartpho-syllable"  # not referenced in the visible cells; presumably a model id - TODO confirm
service = "Categories"  # passed to Configs.ConfigValues(service=...)
inputs = "HNMU.pdf"  # passed to Configs.ConfigValues(inputs=...)
JsonKey = "paragraphs"  # not referenced in the visible cells
JsonField = "Text"  # not referenced in the visible cells

#### PATHS & MODELS

In [None]:
# Resolve every path, key and model identifier for the selected service/input
# from the project configuration, and expose each one as a notebook global.
config = Configs.ConfigValues(service=service, inputs=inputs)
# --- File-system locations ---
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
# --- Keys and model identifiers ---
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
# NOTE(review): "SEARCH_EGINE" (sic) must match the key returned by ConfigValues,
# so the spelling is left untouched here.
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]

# Root folder and sub-folders used to build the model cache paths below.
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"

# Both encoders share the MODEL_ENCODE sub-folder; the summarizer uses MODEL_SUMARY.
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"

# Numeric settings that no visible cell references.
# NOTE(review): the names suggest sequence-length limits and fine-tuning
# hyper-parameters for the summarizer - confirm where they are consumed.
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4

#### DEVICE

In [None]:
# Load both encoders through one ModelLoader instance. Each load_encoder call
# takes a model identifier plus its *_CACHED_MODEL path and is unpacked into a
# (model, device) pair.
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
# chunker / chunksDevice are not referenced again in the visible cells.
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)

In [None]:
# Build the FAISS indexer around the embedding encoder loaded above, on the
# same device. faissIndexer is not referenced again in the visible cells; the
# search cells read the pre-built index loaded by runPrepareData instead.
# NOTE(review): the meaning of flatten_mode / allowed_schema_types /
# max_chars_per_text is defined in Faiss_Embedding - confirm there.
faissIndexer = F_Embedding.DirectFaissIndexer(
    indexer=indexer,
    device=str(embeddDevice),
    batch_size=32,
    show_progress=True,
    flatten_mode="split",
    join_sep="\n",
    allowed_schema_types=("string", "array", "dict"),
    max_chars_per_text=2000,
    normalize=True,
    verbose=False
)

#### LOAD EXISTING DATA

In [None]:
def runPrepareData():
    """Load the persisted artifacts that the search cells depend on.

    Reads four JSON files through ``MU.read_json`` (segment, mapping,
    map-data and map-chunk paths, in that order) and then the FAISS index
    through ``faiss.read_index``. Every path comes from the notebook-level
    ``*Path`` globals.

    Returns:
        tuple: ``(SegmentDict, Mapping, MapData, MapChunk, faissIndex)``.
    """
    jsonSources = (SegmentPath, MappingPath, MapDataPath, MapChunkPath)
    loaded = [MU.read_json(path) for path in jsonSources]
    # The index is read last, after all JSON files.
    loaded.append(faiss.read_index(FaissPath))
    return tuple(loaded)

### RUN PREPARATION

In [None]:
# Load the persisted artifacts once; runSearch reads faissIndex, Mapping,
# MapData and MapChunk as globals, and SegmentDict is used when mapping chunks.
SegmentDict, Mapping, MapData, MapChunk, faissIndex = runPrepareData()

## QUERYING

#### IMPORT CLASSES

In [None]:
# Cross-encoder reranker, placed on the same device as the embedding encoder.
reranker = CrossEncoder(RERANK_MODEL, device=str(embeddDevice))
# Search engine combining the embedding encoder and the reranker.
# top_k=20 / rerank_k=10 match the defaults used by runSearch and runRerank.
engine = F_Searching.SemanticSearchEngine(
    indexer=indexer,
    reranker=reranker,
    device=str(embeddDevice),
    normalize=True,
    top_k=20,
    rerank_k=10,
    rerank_batch_size=16
)

#### SEARCH

In [None]:
def runSearch(query, top_k=20):
    """Retrieve candidate passages for ``query`` from the loaded FAISS index.

    Thin wrapper around ``engine.search`` that supplies the notebook-level
    ``faissIndex``, ``Mapping``, ``MapData`` and ``MapChunk`` globals.

    Args:
        query: Query text, forwarded unchanged to the engine.
        top_k: Number of candidates requested from the engine. Defaults to
            20, the value that was previously hard-coded in this function.

    Returns:
        Whatever ``engine.search`` returns, unchanged.
    """
    return engine.search(
        query=query,
        faissIndex=faissIndex,
        Mapping=Mapping,
        MapData=MapData,
        MapChunk=MapChunk,
        top_k=top_k,
    )

#### RERANK

In [None]:
def runRerank(query, results, top_k=10):
    """Re-rank search hits for ``query`` via ``engine.rerank``.

    Args:
        query: Query text, forwarded unchanged to the engine.
        results: Hits to re-rank, as returned by ``runSearch``.
        top_k: Number of hits requested back from the engine. Defaults to
            10, the value that was previously hard-coded in this function.

    Returns:
        Whatever ``engine.rerank`` returns, unchanged.
    """
    return engine.rerank(
        query=query,
        results=results,
        top_k=top_k,
    )

#### RESPONSE

In [None]:
# Sample query text. The missing spaces inside the string (e.g. "kếhoạch") are
# kept exactly as written - presumably pasted from PDF-extracted text; TODO
# confirm before normalising, since the string is what gets embedded and searched.
query = "Thuật ngữ“quy hoạch động” (Dynamic Programming –DP) do Richard Bellman đặt ra vào thập niên 1950. Từ'programming' ởđây không mang nghĩa 'lập trình máy tính', mà là 'lập kếhoạch' (planning). “Dynamic” nghĩa là 'động', ám chỉquá trình tối ưu liên tiếp qua các giai đoạn. Do đó, quy hoạch động nghĩa là **lập kếhoạch tối ưu theo từng bước, dựa trên kết quảcủa các bước trước đó**"

In [None]:
# Retrieve candidates for the query, then re-rank them via engine.rerank.
# NOTE(review): "resuls" is a typo for "results"; left as is because it is a
# kernel-level name that other cells may read.
resuls = runSearch(query)
reranked = runRerank(query, resuls)

# Print every reranked hit with its 0-based position, rerank score and text.
for i, item in enumerate(reranked):
    print(f"{i}. (Score: {item['rerank_score']:.4f}) {item['text']}")


In [None]:
# Read the API credentials file and take the first entry of its "APIs" list.
data = MU.read_json("Config/APIs.json")
API_KEY = data["APIs"][0]
# Do not print API_KEY: cell outputs are saved with the notebook and would leak the secret.

In [None]:
# Hand the reranked hits and the loaded segment data to the chunk-mapping pipeline.
chunkReturn = ChunkMapper.process_chunks_pipeline(
    reranked_results=reranked,
    SegmentDict=SegmentDict,
    drop_fields=["Index"],          # 1) Fields to drop (applied globally). None → drop nothing
    fields=["Article"],             # 2) Fields to return for each chunk. None → all remaining top-level fields
    n_chunks=1,                     # 3) Number of source chunks to return. None → all
)

In [None]:
# Show the "chunks_text" entry of the pipeline result.
print(chunkReturn["chunks_text"])

In [None]:
# Collect the "Article" field of every returned chunk. dict.get() yields None
# for a chunk that has no such field.
bestArticles = [item["fields"].get("Article") for item in chunkReturn["extracted_fields"]]

if len(bestArticles) == 1:
    # Single chunk: keep the raw value exactly as returned.
    bestArticle = bestArticles[0]
else:
    # Several (or zero) chunks: skip missing values and stringify the rest so
    # str.join cannot raise TypeError on None or non-string entries.
    bestArticle = ", ".join(str(article) for article in bestArticles if article is not None)

print(bestArticle)