In [1]:
import fitz, os, faiss
import re
import pandas as pd
from transformers import pipeline
from Config import Configs
from Config import ModelLoader as ML
from Libraries import Common_MyUtils as MU, Common_TextProcess as TP
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, Json_ChunkUnder as ChunkUnder
from Libraries import Faiss_Embedding as F_Embedding, Faiss_Searching as F_Searching
from Libraries import Summarizer_Runner as SummaryRun
from sentence_transformers import CrossEncoder

In [2]:
# Run-level settings for this notebook.
# NOTE(review): Checkpoint is not referenced by the cells shown here; the
# summarizer loader log prints the same model name — confirm whether it is still needed.
Checkpoint = "vinai/bartpho-syllable"
# service and inputs are forwarded to Configs.ConfigValues in the next cell.
service = "Categories"
inputs = "DP.pdf"
# JsonKey / JsonField are passed to TP.merge_txt when the extracted text is merged.
JsonKey = "paragraphs"
JsonField = "Text"

In [3]:
# Resolve every path and model name for the selected service/input, then
# unpack them into notebook-level names used by the cells below.
config = Configs.ConfigValues(service=service, inputs=inputs)
# inputPath is the PDF opened with fitz.open in the extraction cell.
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
# JSON files read by loadHardcodes (exceptions, markers, status symbols).
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
# NOTE(review): the next five paths are not referenced by the cells shown here.
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
# FaissPath is read with faiss.read_index; the Map* paths are read with MU.read_json.
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
# NOTE(review): the config key is spelled "SEARCH_EGINE"; renaming it here
# would also require changing the Config module — confirm before touching.
SEARCH_EGINE = config["SEARCH_EGINE"]
# Model identifiers: RERANK_MODEL feeds CrossEncoder, EMBEDD/CHUNKS/SUMARY feed ModelLoader.
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]

# Local cache locations handed to ModelLoader alongside each model name.
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = F"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"

# NOTE(review): none of the constants below are referenced by the cells shown
# here; they look like fine-tuning hyperparameters — TODO confirm they are still needed.
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4

In [4]:
def loadHardcodes(file_path, wanted=None):
    """Read a JSON file and return its ``items`` as a ``{key: values}`` dict.

    Args:
        file_path: Path handed to ``MU.read_json``.
        wanted: Optional collection of item keys to keep. When falsy,
            every item in the file is kept.

    Returns:
        dict: Maps each selected item's ``"key"`` to its ``"values"``.
        An empty dict is returned when the payload has no usable
        ``"items"`` entry, so callers always receive a dict instead of
        sometimes getting ``None``.

    Raises:
        KeyError: If an item mapping lacks ``"key"`` or ``"values"``.
    """
    data = MU.read_json(file_path)
    # Tolerate a non-dict payload as well as a missing or null "items" entry.
    items = data.get("items") if isinstance(data, dict) else None
    if not items:
        return {}
    return {
        item["key"]: item["values"]
        for item in items
        if (not wanted) or (item["key"] in wanted)
    }

In [5]:
# Load only the needed sections from each hardcoded JSON file; the three
# results are passed to ExtractData.B1Extractor when the extractor is built.
exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])

In [6]:
# Load the two sentence encoders and the summarizer through one ModelLoader.
# Each load_* call receives the model name and its local cache path.
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
# NOTE(review): chunker / chunksDevice are not used by the cells shown here
# (the chunk builder is given indexer / embeddDevice) — confirm this is intended.
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)

tokenizer, summarizer, summaryDevice = Loader.load_summarizer(SUMARY_MODEL, SUMARY_CACHED_MODEL)

2025-10-19 13:31:04,048 - INFO - Load pretrained SentenceTransformer: Models/Sentence_Transformer/VoVanPhuc/sup-SimCSE-VietNamese-phobert-base



🔍 Loading SentenceTransformer (VoVanPhuc/sup-SimCSE-VietNamese-phobert-base) on cuda ...
CUDA supported: True
Number of GPUs: 1
Current GPU: NVIDIA GeForce RTX 2050
Capability: (8, 6)
CUDA version (PyTorch): 12.1
cuDNN version: 90100


2025-10-19 13:31:04,799 - INFO - Load pretrained SentenceTransformer: Models/Sentence_Transformer/paraphrase-multilingual-MiniLM-L12-v2


📂 Loaded from cache: Models/Sentence_Transformer/VoVanPhuc/sup-SimCSE-VietNamese-phobert-base
✅ SentenceTransformer ready.

🔍 Loading SentenceTransformer (paraphrase-multilingual-MiniLM-L12-v2) on cuda ...
CUDA supported: True
Number of GPUs: 1
Current GPU: NVIDIA GeForce RTX 2050
Capability: (8, 6)
CUDA version (PyTorch): 12.1
cuDNN version: 90100
📂 Loaded from cache: Models/Sentence_Transformer/paraphrase-multilingual-MiniLM-L12-v2
✅ SentenceTransformer ready.

🔍 Initializing summarizer (vinai/bartpho-syllable) on cuda ...
CUDA supported: True
Number of GPUs: 1
Current GPU: NVIDIA GeForce RTX 2050
Capability: (8, 6)
CUDA version (PyTorch): 12.1
cuDNN version: 90100
📂 Loading summarizer from local cache...
✅ Summarizer ready on cuda


In [7]:
# Load the prebuilt search artefacts from disk: three JSON files and the FAISS index.
# Mapping, MapData and faissIndex are passed to searchEngine.search in runSearch.
Mapping = MU.read_json(MappingPath)
MapData = MU.read_json(MapDataPath)
# NOTE(review): MapChunk is not referenced by the cells shown here.
MapChunk = MU.read_json(MapChunkPath)
faissIndex = faiss.read_index(FaissPath)

In [8]:
# PDF extractor configured with the hardcoded exception/marker/status data.
dataExtractor = ExtractData.B1Extractor(
    exceptData,
    markerData,
    statusData,
    proper_name_min_count=10
)

# Chunk builder handed to the summarizer below.
# NOTE(review): it is given indexer / embeddDevice, while the separately loaded
# chunker / chunksDevice go unused — confirm which encoder is intended.
chunkUnder = ChunkUnder.ChunkUndertheseaBuilder(
    embedder=indexer,
    device=embeddDevice,
    min_words=256,
    max_words=768,
    sim_threshold=0.7,
    key_sent_ratio=0.4
)

# Summarizer wired to the loaded tokenizer/model and the chunk builder above.
summarizer_engine = SummaryRun.RecursiveSummarizer(
    tokenizer=tokenizer,
    summarizer=summarizer,
    sum_device=summaryDevice,
    chunk_builder=chunkUnder,
    max_length=200,
    min_length=100,
    max_depth=4
)

# Cross-encoder reranker and the semantic search engine that uses it; both run
# on the same device as the embedding encoder.
reranker = CrossEncoder(RERANK_MODEL, device=str(embeddDevice))
searchEngine = F_Searching.SemanticSearchEngine(
    indexer=indexer,
    reranker=reranker,
    device=str(embeddDevice),
    normalize=True,
    top_k=20,
    rerank_k=10,
    rerank_batch_size=16
)



In [9]:
def extractRun(pdf_doc):
    """Extract content from an open PDF document and merge its lines into paragraphs.

    Args:
        pdf_doc: The document object passed straight to ``dataExtractor.extract``.

    Returns:
        Whatever ``MergeData.mergeLinesToParagraphs`` produces for the
        extracted data.
    """
    return MergeData.mergeLinesToParagraphs(dataExtractor.extract(pdf_doc))

In [10]:
def runSearch(query, top_k=20):
    """Search the loaded FAISS index for entries matching ``query``.

    Args:
        query: Query text forwarded to ``searchEngine.search``.
        top_k: Number of hits requested from the engine. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        The result object returned by ``searchEngine.search``.
    """
    return searchEngine.search(
        query=query,
        faissIndex=faissIndex,
        Mapping=Mapping,
        MapData=MapData,
        top_k=top_k
    )

In [11]:
def runRerank(query, results, top_k=10):
    """Rerank search results against ``query``.

    Args:
        query: Query text forwarded to ``searchEngine.rerank``.
        results: Search hits to rerank (as returned by ``runSearch``).
        top_k: Number of reranked hits to keep. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        The reranked result object returned by ``searchEngine.rerank``.
    """
    return searchEngine.rerank(
        query=query,
        results=results,
        top_k=top_k
    )

In [None]:
# Extract the PDF content. The document is closed in `finally` so the file
# handle is released even when extraction raises.
pdf_doc = fitz.open(inputPath)
try:
    RawDataDict = extractRun(pdf_doc)
finally:
    pdf_doc.close()

# Merge the extracted paragraphs into one string and report its size in
# whitespace-separated words.
full_text = TP.merge_txt(RawDataDict, JsonKey, JsonField)
word_count = len(full_text.split())

print(f"\nTổng số từ trích xuất: {word_count}")

# Summarize the merged text; the result is indexed by "summary_text" below
# and again in the search cell.
summarized = summarizer_engine.summarize(full_text, minInput = 256, maxInput = 1024)
print(summarized["summary_text"])



Tổng số từ trích xuất: 443


In [None]:
# Use the generated summary as the retrieval query for both search and rerank.
summary_query = summarized["summary_text"]
resuls = runSearch(summary_query)
reranked = runRerank(summary_query, resuls)

# Show the text of the top reranked hit, or an empty string when nothing came back.
if reranked:
    best_text = reranked[0]["text"]
else:
    best_text = ""
print(best_text)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Thuật toán
