In [None]:
import os
import faiss
import logging
from Libraries import A0_MyUtils as A0, A1_TextProcess as A1, A2_PdfProcess as A2
from Libraries import B1_ExtractData as B1, B2_MergeData as B2, B3_GetStructures as B3
from Libraries import B4_ChunkMaster as B4, B5_ChunkFlex as B5, B6_ChunkFixed as B6
from Config import Widgets, Configs, ModelLoader as ML

In [None]:
# Create the input form via Widgets.create_name_form(); the returned object is
# handed to Configs.WidgetValues in a later cell to produce `config`.
widgets_list = Widgets.create_name_form()

In [None]:
# Root logger: INFO level, "timestamp - level - message" line format.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Process-wide environment flags, each set to "1".
os.environ.update(
    {
        "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
        "CUDA_LAUNCH_BLOCKING": "1",
        "TORCH_USE_CUDA_DSA": "1",
    }
)

In [None]:
# Build the configuration from the widget form; it is indexed by string key
# below and each entry is unpacked into a notebook-level name.
config = Configs.WidgetValues(widgets_list)

# Folder / file locations, named after their config keys.
data_folder = config["data_folder"]
data_foler = data_folder  # misspelled legacy name, kept as an alias for existing references
dcmt_path = config["dcmt_path"]
base_folder = config["base_folder"]
base_path = config["base_path"]
extracted_path = config["extracted_path"]
merged_path = config["merged_path"]
struct_path = config["struct_path"]
chunks_struct = config["chunks_struct"]
chunks_segment = config["chunks_segment"]
schema_ex_path = config["schema_ex_path"]
embedding_path = config["embedding_path"]
torch_path = config["torch_path"]
faiss_path = config["faiss_path"]
mapping_path = config["mapping_path"]
map_data_path = config["map_data_path"]
meta_path = config["meta_path"]

# Run mode, data keys and model selections.
FILE_TYPE = config["FILE_TYPE"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SWITCH = config["SWITCH"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
# The config key really is spelled "SEARCH_EGINE"; this is a different name
# from SEARCH_ENGINE defined at the end of this cell.
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
API_KEY = config["API_KEY"]

WORD_LIMIT = config["WORD_LIMIT"]

# FAISS index class, hard-coded here rather than read from config.
SEARCH_ENGINE = faiss.IndexFlatIP

## Prepare

### Fixed asset paths

In [None]:
# Asset directory (relative to the notebook) and the three JSON files in it
# that are passed to the extractor.
assets = "../Assets/"
exceptions_path = assets + "ex.exceptions.json"
markers_path = assets + "ex.markers.json"
status_path = assets + "ex.status.json"

### Import Classes

In [None]:
# Extractor built from the three asset JSON paths (passed positionally, in
# this order) with proper_name_min_count set to 10.
dataExtractor = B1.B1Extractor(
    exceptions_path, markers_path, status_path,
    proper_name_min_count=10,
)

In [None]:
# Structure analyzer pointed at the merged-data path, with verbose output on.
structAnalyzer = B3.StructureAnalyzer(merged_path=merged_path, verbose=True)

In [None]:
# Chunk builder configured with the structure file and the merged-data file.
chunkBuilder = B4.ChunkBuilder(struct_path=struct_path, merged_path=merged_path)

### Extract Data

In [None]:
def extractRun():
    """Extract the document, then merge its lines into paragraphs.

    Calls ``dataExtractor.extract`` on ``dcmt_path`` and writes the result to
    ``extracted_path``; then passes that result through
    ``B2.mergeLinesToParagraphs`` and writes the output to ``merged_path``.
    Both files are written with ``indent=1``. Returns nothing.
    """
    raw_extract = dataExtractor.extract(dcmt_path)
    # Persist the raw extraction before merging.
    A0.write_json(raw_extract, extracted_path, indent=1)

    paragraphs = B2.mergeLinesToParagraphs(raw_extract)
    A0.write_json(paragraphs, merged_path, indent=1)

### Get Struct

In [None]:
def structRun():
    """Run the structure-analysis stages and save the final structure.

    Calls, in order, ``structAnalyzer.extract_markers``, ``build_structures``,
    ``deduplicate``, ``select_top`` and ``extend_top``. The built structures,
    the deduplicated structures and the extended top result are printed along
    the way; the extended top result is then written to ``struct_path`` with
    ``indent=2``. Returns nothing.
    """
    found_markers = structAnalyzer.extract_markers()

    all_structures = structAnalyzer.build_structures(found_markers)
    print(A0.jsonl_convert(all_structures))

    unique_structures = structAnalyzer.deduplicate(all_structures)
    print(A0.jsonl_convert(unique_structures))

    # select_top runs first; its result and the deduplicated list feed extend_top.
    extended_top = structAnalyzer.extend_top(
        structAnalyzer.select_top(unique_structures),
        unique_structures,
    )
    print(A0.json_convert(extended_top, pretty=True))

    A0.write_json(extended_top, struct_path, indent=2)

### Chunks

In [None]:
def chunkRun():
    """Build chunks, save them all, then save the re-indexed non-blank subset.

    1. ``chunkBuilder.build()`` produces the full chunk list, which is written
       to ``chunks_struct``.
    2. Chunks whose ``"Level 1"`` value is blank (or missing) are dropped, the
       remaining ones get ``"Index"`` renumbered from 1, and that list is
       written to ``chunks_segment``.

    Returns nothing.
    """
    chunks = chunkBuilder.build()
    # NOTE(review): this previously wrote to `chunks_base`, a name that is never
    # defined in this notebook, so the call raised NameError on a fresh kernel.
    # `chunks_struct` is the chunk output path loaded from config that nothing
    # else writes to — confirm it is the intended target.
    A0.write_json(chunks, chunks_struct, indent=2)

    # Keep only chunks with a non-blank "Level 1" value.
    filtered = [item for item in chunks if item.get("Level 1", "").strip()]
    # Renumber the survivors from 1. This mutates the same dict objects held in
    # `chunks`, which has already been written above.
    for i, item in enumerate(filtered, start=1):
        item["Index"] = i

    A0.write_json(filtered, chunks_segment, indent=2)

### Run

In [None]:
# Run the pipeline only when FILE_TYPE is "Data". The order matters:
# extractRun writes merged_path, structRun writes struct_path, and the
# analyzer/builder used by the later stages were configured with those paths.
if FILE_TYPE == "Data":
    extractRun()
    structRun()
    chunkRun()