In [None]:
from RAGLibrary import myWidgets, myRAG, checkConstruct, createSchema, faissConvert, embedding
import os
import json
import torch
import faiss
import logging
from typing import Any, Dict
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

In [None]:
# Build the configuration form widgets. The DEFINE cell below reads `.value`
# from them when it runs, so the form must be filled in before that cell is executed.
widgets_list = myWidgets.create_name_form()

In [None]:
# Log at INFO level with a "timestamp - level - message" layout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Sets the Hugging Face Hub flag that disables its symlink warning.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# Passed as `force_download=` to the `from_pretrained()` calls in the model-loading cell.
force_download = True

In [None]:
""" DEFINE """

# --- Unpack the form returned by myWidgets.create_name_form() ---------------
# assumes the list is ordered exactly as indexed below — TODO confirm against myWidgets
data   = widgets_list[0]  # HBox 1
keys   = widgets_list[1]  # HBox 2
choose = widgets_list[2]  # HBox 3

embedd_model = widgets_list[3]
search_egine = widgets_list[4]
rerank_model = widgets_list[5]
respon_model = widgets_list[6]
API_drop     = widgets_list[7]
button_box   = widgets_list[8]

# HBox 1
file_name = data.children[0]
file_type = data.children[1]

# HBox 2
data_key = keys.children[0]
embe_key = keys.children[1]

# HBox 3
switch_model = choose.children[0]
merge_otp    = choose.children[1]
# NOTE(review): this reads the same child as `merge_otp`, so `path_end` always
# equals `merge`. If HBox 3 has a third widget this was probably meant to be
# children[2] — confirm before changing.
path_end_val = choose.children[1]

# --- Snapshot the current widget values -------------------------------------
data_folder   = file_name.value
file_type_val = file_type.value

data_key_val  = data_key.value
embe_key_val  = embe_key.value

API_key_val = API_drop.value
switch      = switch_model.value
merge       = merge_otp.value
path_end    = path_end_val.value

embedding_model = embedd_model.value
searching_egine = search_egine.value
reranking_model = rerank_model.value
responing_model = respon_model.value

# --- Derived file paths -----------------------------------------------------
base_path = f"../Data/{data_folder}/{file_type_val}_{data_folder}"

json_file_path = f"{base_path}_Database.json"
schema_ex_path = f"{base_path}_Schema.json"
# NOTE(review): the path uses the raw widget value `merge`, not the effective
# `MERGE` computed below, so the two can disagree when FILE_TYPE != "Data".
embedding_path = f"{base_path}_Embeds_{merge}"

torch_path   = f"{embedding_path}.pt"
faiss_path   = f"{embedding_path}.faiss"
mapping_path = f"{embedding_path}_mapping.json"
mapping_data = f"{embedding_path}_map_data.json"

# --- Constants consumed by the cells below ----------------------------------
FILE_TYPE    = file_type_val
DATA_KEY     = data_key_val
EMBE_KEY     = embe_key_val
SWITCH       = switch
EMBEDD_MODEL = embedding_model
SEARCH_EGINE = searching_egine
RERANK_MODEL = reranking_model
RESPON_MODEL = responing_model

# Merging is only honoured for the "Data" file type; everything else is forced to "no_Merge".
MERGE = merge if FILE_TYPE == "Data" else "no_Merge"

API_KEY = API_key_val

SEARCH_ENGINE = faiss.IndexFlatIP

# Never echo the raw key: notebook outputs are saved with the file and shared.
_api_key_text = "" if API_KEY is None else str(API_KEY)
if len(_api_key_text) > 8:
    _api_key_shown = "*" * (len(_api_key_text) - 4) + _api_key_text[-4:]
elif _api_key_text:
    _api_key_shown = "*" * len(_api_key_text)
else:
    _api_key_shown = "<not set>"

print("\n")
print(f"Embedder: {EMBEDD_MODEL}")
print(f"Searcher: {SEARCH_EGINE}")
print(f"Reranker: {RERANK_MODEL}")
print(f"Responer: {RESPON_MODEL}")
print(f"Data Key: {DATA_KEY}")
print(f"Embe Key: {EMBE_KEY}")
print(f"Database: {json_file_path}")
print(f"Torch   : {torch_path}")
print(f"Faiss   : {faiss_path}")
print(f"Mapping : {mapping_path}")
print(f"Map Data: {mapping_data}")
print(f"Schema  : {schema_ex_path}")
print(f"Model   : {SWITCH}")
print(f"Merge   : {MERGE}")
print(f"API Key : {_api_key_shown}")

In [None]:
# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the embedding model selected by SWITCH. Loader errors propagate unchanged.
if SWITCH == "Auto Model":
    # NOTE(review): create_embedding() calls `model.encode(...)`; confirm that the
    # object returned by AutoModel provides it before relying on this branch.
    tokenizer = AutoTokenizer.from_pretrained(EMBEDD_MODEL, force_download=force_download)
    model = AutoModel.from_pretrained(EMBEDD_MODEL, force_download=force_download).to(device)
    print("Model and tokenizer loaded successfully")
elif SWITCH == "Sentence Transformer":
    # NOTE(review): this branch ignores EMBEDD_MODEL and always loads the model
    # stored in the local "../../cached_model" directory.
    model = SentenceTransformer("../../cached_model")
    print("SentenceTransformer loaded successfully")
else:
    # Fail fast: otherwise `model` stays undefined (or stale from an earlier run
    # of this kernel) and the failure surfaces much later in an unrelated cell.
    raise ValueError(
        f"Unsupported model type {SWITCH!r}; expected 'Auto Model' or 'Sentence Transformer'"
    )

print(f"Using: {device}")

In [None]:
""" PREPROCESS TEXT """

def preprocess_text(text):
    """Clean a string, or every element of a list, for embedding.

    Strings are stripped, every character that is not a word character,
    whitespace or one of ``( ) . , ; : - –`` is removed, and runs of two or
    more spaces are collapsed into one. Lists are processed element by element
    (recursively). Any other value is returned unchanged.
    """
    import re

    if isinstance(text, list):
        return list(map(preprocess_text, text))
    if not isinstance(text, str):
        return text

    without_symbols = re.sub(r'[^\w\s\(\)\.\,\;\:\-–]', '', text.strip())
    return re.sub(r'[ ]{2,}', ' ', without_symbols)

In [None]:
""" PREPROCESS DATA """

def preprocess_data(data):
    """Apply ``preprocess_text`` to every leaf of a nested dict/list structure.

    Dict keys are kept as they are; only values are processed. Containers are
    rebuilt, so the input object itself is not modified.
    """
    if isinstance(data, dict):
        cleaned = {}
        for field, content in data.items():
            cleaned[field] = preprocess_data(content)
        return cleaned
    if isinstance(data, list):
        return list(map(preprocess_data, data))
    return preprocess_text(data)

In [None]:
""" LOAD SCHEMA """

def load_schema(schema_path: str) -> Dict[str, str]:
    """Read the JSON schema stored at ``schema_path``.

    Returns the parsed JSON document. If the file does not exist or does not
    contain valid JSON, a message is printed and an empty dict is returned.
    """
    try:
        with open(schema_path, 'r', encoding='utf-8') as schema_file:
            raw_schema = schema_file.read()
        return json.loads(raw_schema)
    except (FileNotFoundError, json.JSONDecodeError) as err:
        if isinstance(err, FileNotFoundError):
            print(f"Schema file not found: {schema_path}")
        else:
            print(f"Invalid schema file: {schema_path}")
        return {}

In [None]:
""" FLATTEN JSON """

def flatten_json(data: Any, prefix: str = "", schema: Dict[str, str] = None) -> Dict[str, Any]:
    """Flatten nested dicts into a single dict keyed by dotted paths.

    Nested dicts are expanded recursively ("a.b.c"). A non-empty list is kept
    whole under its path and is not expanded. Empty lists and empty dicts
    produce no entry. ``schema`` is only forwarded to the recursive calls.
    Input that is neither a dict nor a list yields an empty dict.
    """
    schema = {} if schema is None else schema

    if isinstance(data, list):
        # Store the list under the current path, minus the trailing '.'.
        return {prefix.rstrip('.'): data} if data else {}
    if not isinstance(data, dict):
        return {}

    flattened: Dict[str, Any] = {}
    for name, child in data.items():
        path = f"{prefix}{name}" if prefix else name
        if isinstance(child, (dict, list)):
            # Containers are flattened one level deeper.
            flattened.update(flatten_json(child, f"{path}.", schema))
        else:
            # Scalars are stored directly under their dotted path.
            flattened[path] = child
    return flattened

In [None]:
""" CREATE EMBEDDDING """

def create_embedding(texts, batch_size=32):
    """Encode ``texts`` with the module-level ``model`` on the module-level ``device``.

    If encoding raises a RuntimeError whose message contains
    "CUDA out of memory", the model is moved to the CPU and the encoding is
    retried there. Any other RuntimeError is re-raised.
    """
    encode_options = dict(batch_size=batch_size, convert_to_tensor=True)
    try:
        return model.encode(texts, device=device, **encode_options)
    except RuntimeError as e:
        if "CUDA out of memory" not in str(e):
            raise e
        print("VRAM overflow. Switching to CPU.")
        model.to("cpu")
        return model.encode(texts, device="cpu", **encode_options)

In [None]:
""" CREATE EMBEDDDINGS """

def _field_to_text(value: Any):
    """Return the preprocessed text of one flattened field, or None if it should be skipped."""
    if isinstance(value, str):
        return preprocess_text(value) if value.strip() else None
    if isinstance(value, list):
        joined = "\n".join(preprocess_text(str(item)) for item in value if str(item).strip())
        return joined if joined.strip() else None
    return None


def create_embeddings(data: Any, schema: Dict[str, str], model, device: torch.device, merge: str = "NO") -> Dict[str, Any]:
    """Embed the text fields of one record and return the record with embeddings attached.

    Only flattened fields whose schema type is "string" or "array" are embedded.

    * When the module-level ``MERGE`` equals "Merge", all eligible fields are
      joined with newlines and embedded once; the result holds only
      "Merged_text" and "Merged_embedding" (empty if nothing was embeddable).
    * Otherwise each field is embedded separately and stored beside the field
      as "<field> Embedding" in a deep copy of ``data``.

    Args:
        data: One record (nested dicts / lists / scalars).
        schema: Maps dotted field paths to a type name.
        model: Not used here; encoding goes through ``create_embedding``.
        device: Device the embedding is moved to before conversion to a list.
        merge: Not used; the module-level ``MERGE`` decides the mode.

    Returns:
        The merged-mode dict, or a deep copy of ``data`` with embeddings added.
        The input ``data`` is never modified.
    """
    import copy

    flat_data = flatten_json(data, schema=schema)

    # Collect the embeddable text per field, preserving field order.
    texts = {}
    for key, value in flat_data.items():
        if schema.get(key) in ("string", "array"):
            text = _field_to_text(value)
            if text is not None:
                texts[key] = text

    if MERGE == "Merge":
        result = [{} for _ in range(len(data))] if isinstance(data, list) else {}
        if texts:
            merged_text = "\n".join(texts.values())
            if merged_text.strip():
                embedding = create_embedding(merged_text).to(device)
                result["Merged_text"] = merged_text
                result["Merged_embedding"] = embedding.tolist()
        return result

    # A deep copy keeps the nested writes below from leaking into the caller's
    # record; a shallow copy would share every nested dict with `data`.
    result = copy.deepcopy(data)
    for key, text in texts.items():
        embedding = create_embedding(text).to(device)
        *parents, leaf = str(key).split('.')
        current = result
        for part in parents:
            current = current.setdefault(part, {})
        current[leaf + " Embedding"] = embedding.tolist()
    return result

In [None]:
""" JSON EMBEDDDING """

def json_embeddings(json_file_path: str, 
                    torch_path: str, 
                    schema_path: str, 
                    model, 
                    device: torch.device, 
                    DATA_KEY: str, 
                    EMBE_KEY: str) -> None:
    """Embed every record of a JSON file and save the result with ``torch.save``.

    Does nothing except print a notice when ``torch_path`` already exists.
    Otherwise the schema and the JSON file are loaded, each record's text is
    preprocessed, ``create_embeddings`` is called per record, and a dict with
    two entries is written to ``torch_path``: ``DATA_KEY`` maps to the list of
    processed records and ``EMBE_KEY`` maps to a list holding only each
    record's embedding fields.

    Raises:
        ValueError: if the schema is empty or could not be loaded.
        Exception: any other failure is printed and re-raised unchanged.
    """
    # Skip the work when the embedding file has already been produced.
    if os.path.exists(torch_path):
        print(f"\nEmbedding loaded from {torch_path}\n")
        return

    def _is_embedding_field(name) -> bool:
        """Tell whether a flattened key holds an embedding in the current MERGE mode."""
        if MERGE == "Merge":
            return name == "Merged_embedding"
        return name.endswith("Embedding")

    print(f"\nCreating embeddings for JSON data...\n")
    try:
        # Load the schema.
        schema = load_schema(schema_path)
        if not schema:
            raise ValueError("Schema is empty or invalid")

        # Load the JSON file; a single object is treated as a one-record list.
        with open(json_file_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
        if not isinstance(records, list):
            records = [records]

        output_data = []
        for record in records:
            # Preprocess every string / list leaf of the flattened record.
            cleaned_fields = {
                path: preprocess_text(value) if isinstance(value, (str, list)) else value
                for path, value in flatten_json(record).items()
            }

            # Write the cleaned values back into the original nested layout.
            processed = record.copy()
            for path, value in cleaned_fields.items():
                *parents, leaf = path.split('.')
                node = processed
                for part in parents:
                    node = node[part]
                node[leaf] = value

            # Create the embeddings for this record.
            output_data.append(create_embeddings(processed, schema, model, device))

        # Keep a second view that contains only the embedding fields.
        embeddings_only = [
            {name: value for name, value in flatten_json(item).items() if _is_embedding_field(name)}
            for item in output_data
        ]

        payload = {
            f"{DATA_KEY}": output_data,
            f"{EMBE_KEY}": embeddings_only,
        }
        torch.save(payload, torch_path)

        print(f"Embedding tensor saved to {torch_path}")

    except Exception as e:
        print(f"Error processing JSON with embeddings: {e}")
        raise

In [None]:
""""" MAIN - CREATE SCHEMA & EMBEDDDING """""

# Build the embedding file only when the source database JSON is present.
if not os.path.exists(json_file_path):
    print(f"JSON path does not exist: {json_file_path}")
else:
    json_embeddings(json_file_path, torch_path, schema_ex_path, model, device, DATA_KEY, EMBE_KEY)

In [None]:
""" CHECK EMBEDDDING CONTRUCTION """

def print_json(pt_path: str) -> None:
    """Print the structure of the first record stored in a ``.pt`` embedding file.

    The file is expected to hold a dict whose ``DATA_KEY`` entry (module-level
    constant) is a non-empty list of records. The first record is printed as
    indented JSON, with every list made up only of numbers replaced by its
    length so that embedding vectors do not flood the output.

    All problems (missing file, unexpected layout, load errors) are reported
    with a printed message; nothing is raised.
    """
    try:
        if not os.path.exists(pt_path):
            print(f"File không tồn tại: {pt_path}")
            return

        # SECURITY: weights_only=False unpickles arbitrary objects and can run
        # code embedded in the file. Only open files produced by this notebook.
        data = torch.load(pt_path, map_location="cpu", weights_only=False)

        if not (isinstance(data, dict) and f"{DATA_KEY}" in data):
            # Report the key that was actually looked up, not a fixed name.
            print(f"Dữ liệu không đúng định dạng: không tìm thấy key '{DATA_KEY}'")
            return
        content = data[f"{DATA_KEY}"]

        if not isinstance(content, list) or not content:
            print("Dữ liệu rỗng hoặc không phải danh sách")
            return

        first_json = content[0]

        def process_json(obj: Any) -> Any:
            """Replace every all-numeric list with its length, recursing through dicts and lists."""
            if isinstance(obj, dict):
                return {k: process_json(v) for k, v in obj.items()}
            elif isinstance(obj, list) and all(isinstance(x, (float, int)) for x in obj):
                return len(obj)
            elif isinstance(obj, list):
                return [process_json(item) for item in obj]
            return obj

        processed_json = process_json(first_json)

        print(json.dumps(processed_json, ensure_ascii=False, indent=2))

    except Exception as e:
        print(f"Lỗi khi đọc file .pt: {str(e)}")

print_json(torch_path)