In [None]:
import os
import torch
import faiss
import logging
from Libraries import C1_CreateSchema as C1, C2_Embedding as C2, C3_CheckStruct as C3
from Config import Widgets, Configs, ModelLoader as ML

In [None]:
# Create the configuration widgets; their values are read back by
# Configs.WidgetValues(widgets_list) in a later cell.
# NOTE(review): presumably this also renders the input form in the cell output — confirm.
widgets_list = Widgets.create_name_form()

In [None]:
# Root logger: INFO level, "<timestamp> - <level> - <message>" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Environment switches for the Hugging Face Hub / CUDA / PyTorch runtimes,
# applied in a single pass.
os.environ.update({
    "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
    "CUDA_LAUNCH_BLOCKING": "1",
    "TORCH_USE_CUDA_DSA": "1",
})

In [None]:
# Snapshot the current widget values into a dict-like config object and
# unpack it into module-level names used by the rest of the notebook.
config = Configs.WidgetValues(widgets_list)

# --- Paths / folders -------------------------------------------------------
data_folder = config["data_folder"]
data_foler = data_folder  # legacy misspelled alias, kept so existing references keep working
dcmt_path = config["dcmt_path"]
base_folder = config["base_folder"]
base_path = config["base_path"]
extracted_path = config["extracted_path"]
merged_path = config["merged_path"]
struct_path = config["struct_path"]
chunks_base = config["chunks_base"]
chunks_segment = config["chunks_segment"]
schema_ex_path = config["schema_ex_path"]
embedding_path = config["embedding_path"]
torch_path = config["torch_path"]
faiss_path = config["faiss_path"]
mapping_path = config["mapping_path"]
map_data_path = config["map_data_path"]
meta_path = config["meta_path"]

# --- Keys, model names and switches ----------------------------------------
FILE_TYPE = config["FILE_TYPE"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SWITCH = config["SWITCH"]  # selects the loader: "Auto Model" or "Sentence Transformer"
EMBEDD_MODEL = config["EMBEDD_MODEL"]
SEARCH_EGINE = config["SEARCH_EGINE"]  # key spelling comes from the Configs module; do not "fix" here
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
API_KEY = config["API_KEY"]

WORD_LIMIT = config["WORD_LIMIT"]

# FAISS index class (flat, inner-product).
# NOTE(review): the configured SEARCH_EGINE value above is read but not used
# here; confirm whether the index class should be derived from it.
SEARCH_ENGINE = faiss.IndexFlatIP

## Prepare

### Fixed

In [None]:
# Local directory checked first for an already-available embedding model;
# the model-loading cell falls back to EMBEDD_MODEL when this path does not exist.
cached_path = "../Models"

### Device and Model Loading

In [None]:
# Project helper for CUDA diagnostics.
# NOTE(review): presumably prints CUDA availability details — confirm in ModelLoader.
ML.CudaCheck()

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prefer the local cache directory; otherwise load by the configured model name.
use_cache = os.path.exists(cached_path)
model_source = cached_path if use_cache else EMBEDD_MODEL

if SWITCH == "Auto Model":
    tokenizer, model = ML.load_auto_model(model_source, device)
    if model is None and use_cache:
        # The cached copy could not be loaded: fall back to the configured model.
        model_source = EMBEDD_MODEL
        tokenizer, model = ML.load_auto_model(model_source, device)
    # Report the source that was actually used (after any fallback).
    print(f"ℹ️ Auto Model: {model_source}")

elif SWITCH == "Sentence Transformer":
    model = ML.load_sentence_model(model_source, device)
    if model is None and use_cache:
        # The cached copy could not be loaded: fall back to the configured model.
        model_source = EMBEDD_MODEL
        model = ML.load_sentence_model(model_source, device)
    # Report the source that was actually used (after any fallback).
    print(f"ℹ️ Sentece Transformer: {model_source}")

else:
    # Fail here instead of leaving `model` undefined and hitting a NameError
    # in a later cell.
    raise ValueError(
        f"Unsupported SWITCH value {SWITCH!r}; "
        "expected 'Auto Model' or 'Sentence Transformer'"
    )

print(f"✅ Using: {device}")

### Instantiate Classes

In [None]:
# Schema extractor instance used by schemaRun(); options are passed through
# to C1.JSONSchemaExtractor unchanged.
schemaEx = C1.JSONSchemaExtractor(verbose=True, list_policy="first")

In [None]:
# Embedding wrapper around the model loaded in the device/model cell.
# The device string is chosen at runtime (it was hard-coded to "cuda:0"), so the
# notebook also works on hosts without a GPU; on CUDA hosts the value is unchanged.
# NOTE(review): assumes C2.JSONEmbedding accepts "cpu" as a device string — confirm.
Embedding = C2.JSONEmbedding(
    model=model,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    batch_size=32,
    show_progress=False,
    flatten_mode="split",
)

In [None]:
# Torch -> FAISS converter; convert() is invoked in the final "Run" cell.
# NOTE(review): `D0` is not imported in any visible cell, so this raises
# NameError on a fresh kernel — add the missing import from the project package.
FaissConverter = D0.Torch2FaissConverter(
    # Artifact locations taken from the widget configuration.
    schema_ex_path=schema_ex_path, torch_path=torch_path, faiss_path=faiss_path,
    mapping_path=mapping_path, map_data_path=map_data_path,
    # Conversion options.
    mode=EMBE_KEY, nlist=100, keep_last=2, use_pickle=False,
)

### Schema Extract

In [None]:
def schemaRun():
    """Run schema extraction on the segmented chunks file and print the result.

    If ``chunks_segment`` does not exist, only a notice is printed. Otherwise
    ``schemaEx.schemaRun`` is called with ``schema_path=schema_ex_path`` and
    the file at ``schema_ex_path`` is then read back and printed.
    """
    # Local import: stdlib replacement for `A0.read_json`, whose module `A0`
    # is never imported in this notebook (NameError on a fresh kernel).
    import json

    if not os.path.exists(chunks_segment):
        print(f"{chunks_segment} does not exist")
        return

    schemaEx.schemaRun(chunks_segment, schema_path=schema_ex_path)

    # Assumes the extractor writes UTF-8 encoded JSON to schema_ex_path — TODO confirm.
    with open(schema_ex_path, encoding="utf-8") as schema_file:
        chunksSchema = json.load(schema_file)
    print(chunksSchema)

### Embedding

In [None]:
def embeddingRun():
    """Embed the segmented chunks file and print a summary of the result.

    If ``chunks_segment`` does not exist, only a notice is printed. Otherwise
    ``Embedding.embeddingRun`` is called with the configured paths and keys
    (always recomputing, since ``skip_if_exists`` is False), followed by
    ``C3.print_json`` on the produced ``torch_path``.
    """
    if not os.path.exists(chunks_segment):
        print(f"{chunks_segment} does not exist")
        return

    run_options = dict(
        json_path=chunks_segment,
        schema_path=schema_ex_path,
        torch_path=torch_path,
        data_key=DATA_KEY,
        embe_key=EMBE_KEY,
        skip_if_exists=False,
    )
    Embedding.embeddingRun(**run_options)

    C3.print_json(DATA_KEY, EMBE_KEY, torch_path)

### Run

In [None]:
# Pipeline order: schema extraction -> embedding -> Torch-to-FAISS conversion.
# NOTE(review): schemaRun()/embeddingRun() only print a notice when
# chunks_segment is missing, so convert() is still called in that case —
# confirm this is intended.
schemaRun()
embeddingRun()
FaissConverter.convert()