In [None]:
import fitz, os, faiss
from sentence_transformers import CrossEncoder

from Config import Configs
from Config import ModelLoader as ML

from Libraries import Common_MyUtils as MU
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData
from Libraries import Json_GetStructures as GetStructures, Json_ChunkMaster as ChunkMaster, Json_SchemaExt as SchemaExt
from Libraries import Faiss_Embedding as F_Embedding, Faiss_Searching as F_Searching

## CONFIGURATIONS

#### SERVICES

In [None]:
# Service identifier and the source document it is built from; both are
# handed to Configs.ConfigValues in the PATHS & MODELS cell.
service = "HNMU"
inputs = "HNMU.pdf"

# Model checkpoint identifier.
# NOTE(review): not referenced by the cells visible in this section — confirm it is still needed.
Checkpoint = "vinai/bartpho-syllable"

# JSON key / field names.
# NOTE(review): presumably select the paragraph list and its text field in the
# extracted JSON; not referenced in the visible cells — verify against the libraries.
JsonKey = "paragraphs"
JsonField = "Text"

#### PATHS & MODELS

In [None]:
# Resolve all service-specific settings once; every name below is a plain
# lookup into the mapping returned by Configs.ConfigValues.
config = Configs.ConfigValues(service=service, inputs=inputs)

# --- Source documents and bookkeeping files ---
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]

# --- Intermediate extraction artifacts ---
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]

# --- FAISS index and its mapping files (read back by runPrepareData) ---
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]

# --- Data keys ---
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]

# --- Engine and model names ---
# The "SEARCH_EGINE" spelling is intentional here: it must match the config key.
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]

# --- Local model locations: <MODEL_DIR>/<MODEL_TYPE>/<model name> ---
MODEL_DIR = "Models"
MODEL_TYPE = "Sentence_Transformer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_TYPE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_TYPE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_TYPE}/{SUMARY_MODEL}"

# --- Sequence-length and training hyperparameters ---
# NOTE(review): none of these are read by the cells visible in this section;
# presumably they belong to a fine-tuning step elsewhere — confirm.
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4


#### DEVICE

In [None]:
# Load the embedding model and the chunking model. Each call is given the
# configured model name plus its local "Models/Sentence_Transformer/<name>"
# path and is unpacked into a (model, device) pair.
# NOTE(review): how ML.init_sentence_model uses the second argument (cache vs.
# download target) is defined in Config.ModelLoader — confirm there.
indexer, embeddDevice = ML.init_sentence_model(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
chunker, chunksDevice = ML.init_sentence_model(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)

In [None]:
# Build the FAISS indexing helper around the embedding model loaded above,
# passing the device as its string form.
# NOTE(review): the meaning of the remaining options (flatten_mode, join_sep,
# allowed_schema_types, max_chars_per_text, normalize, ...) is defined by
# Libraries.Faiss_Embedding.DirectFaissIndexer, which is not shown here — confirm there.
# NOTE(review): faissIndexer is not used by the cells visible in this section.
faissIndexer = F_Embedding.DirectFaissIndexer(
    indexer=indexer,
    device=str(embeddDevice),
    batch_size=32,
    show_progress=True,
    flatten_mode="split",
    join_sep="\n",
    allowed_schema_types=("string", "array", "dict"),
    max_chars_per_text=2000,
    normalize=True,
    verbose=False
)

#### LOAD EXISTING DATA

In [None]:
def runPrepareData():
    """Load the previously saved mapping files and FAISS index from disk.

    Reads the three JSON files at ``MappingPath``, ``MapDataPath`` and
    ``MapChunkPath`` (in that order) with ``MU.read_json``, then the index at
    ``FaissPath`` with ``faiss.read_index``.

    Returns:
        tuple: ``(Mapping, MapData, MapChunk, faissIndex)``.
    """
    mapping_json = MU.read_json(MappingPath)
    map_data_json = MU.read_json(MapDataPath)
    map_chunk_json = MU.read_json(MapChunkPath)

    # The index is read last, after all JSON side files have loaded.
    loaded_index = faiss.read_index(FaissPath)

    return (mapping_json, map_data_json, map_chunk_json, loaded_index)

#### RUN PREPARATION

In [None]:
# Load the saved mappings and FAISS index into notebook-level names; the
# search cell below reads Mapping, MapData and faissIndex as globals.
Mapping, MapData, MapChunk, faissIndex = runPrepareData()

## QUERYING

#### INITIALIZE RERANKER & SEARCH ENGINE

In [None]:
# Cross-encoder reranker, placed on the same device as the embedding model.
reranker = CrossEncoder(RERANK_MODEL, device=str(embeddDevice))

# Search engine combining the embedding model (retrieval) with the reranker.
# The 20 / 10 values configured here are the same ones used by runSearch and
# runRerank below; keep them in sync if either side changes.
# NOTE(review): whether these constructor values act as defaults for
# search()/rerank() is defined in Libraries.Faiss_Searching — confirm there.
engine = F_Searching.SemanticSearchEngine(
    indexer=indexer,
    reranker=reranker,
    device=str(embeddDevice),
    normalize=True,
    top_k=20,
    rerank_k=10,
    rerank_batch_size=16
)

#### SEARCH

In [None]:
def runSearch(query, top_k=20):
    """Retrieve candidate results for ``query`` from the loaded FAISS index.

    Thin wrapper over ``engine.search`` that supplies the notebook-level
    ``faissIndex``, ``Mapping`` and ``MapData`` objects.

    Args:
        query: The query forwarded to ``engine.search``.
        top_k: Number of candidates to request. Defaults to 20, the value
            that used to be hard-coded here, so existing calls are unchanged.

    Returns:
        Whatever ``engine.search`` returns, passed through unchanged.
    """
    return engine.search(
        query=query,
        faissIndex=faissIndex,
        Mapping=Mapping,
        MapData=MapData,
        top_k=top_k,
    )

#### RERANK

In [None]:
def runRerank(query, results, top_k=10):
    """Rerank previously retrieved ``results`` against ``query``.

    Thin wrapper over ``engine.rerank``.

    Args:
        query: The query forwarded to ``engine.rerank``.
        results: Candidate results, typically the output of ``runSearch``.
        top_k: Number of reranked results to keep. Defaults to 10, the value
            that used to be hard-coded here, so existing calls are unchanged.

    Returns:
        Whatever ``engine.rerank`` returns, passed through unchanged.
    """
    return engine.rerank(
        query=query,
        results=results,
        top_k=top_k,
    )

#### RESPONSE

In [None]:
# Demo query (Vietnamese): "How is students' academic performance evaluated?"
query = "Cách đánh giá kết quả học tập của sinh viên như thế nào?"

In [None]:
# Two-stage lookup for the demo query: candidate search, then reranking.
# NOTE: the `resuls` spelling is kept because it is a notebook-level name
# that other cells may read.
resuls = runSearch(query)
reranked = runRerank(query, resuls)

# Print each reranked hit, numbered from 0, with its score to four decimals.
for position, hit in enumerate(reranked):
    line = f"{position}. (Score: {hit['rerank_score']:.4f}) {hit['text']}"
    print(line)


In [None]:
# Read the API credentials file and use the first entry of its "APIs" list.
data = MU.read_json("Config/APIs.json")
API_KEY = data["APIs"][0]
# Do not print API_KEY: cell output is saved with the notebook and would expose the secret.