In [36]:
import os
import json
import logging
from datetime import datetime
from elasticsearch import Elasticsearch, helpers

In [37]:
OCR_RESULTS_ROOT = "/Volumes/Transcend/AIC/AIO-AIClosers/PublicData/result"
ES_HOST = "127.0.0.1"
ES_PORT = 9200
ES_INDEX = "ocr_extractions"

In [38]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [39]:
def create_es_client():
    es = Elasticsearch([{
        "host": ES_HOST,
        "port": ES_PORT,
        "scheme": "http"
    }])
    if not es.ping():
        raise RuntimeError("Cannot connect to Elasticsearch")
    logger.info("Connected to Elasticsearch at %s:%s", ES_HOST, ES_PORT)
    return es

In [40]:
def create_ocr_index(es_client):
    if es_client.indices.exists(index=ES_INDEX):
        logger.info(f"Index {ES_INDEX} already exists. Skipping creation.")
        return
    mapping = {
        "mappings": {
            "properties": {
                "image_filename": {"type": "keyword"},
                "image_path": {"type": "keyword"},
                "processing_timestamp": {"type": "date"},
                "ocr_results": {
                    "type": "nested",
                    "properties": {
                        "text": {"type": "text"},
                        "confidence": {"type": "float"},
                        "bbox": {
                            "properties": {
                                "x1": {"type": "integer"},
                                "y1": {"type": "integer"},
                                "x2": {"type": "integer"},
                                "y2": {"type": "integer"}
                            }
                        }
                    }
                },
                "extracted_text_full": {"type": "text"},
                "total_confidence": {"type": "float"},
                "processing_status": {"type": "keyword"},
                "error_message": {"type": "text"}
            }
        }
    }
    es_client.indices.create(index=ES_INDEX, body=mapping)
    logger.info(f"Created new index: {ES_INDEX}")

In [41]:
def load_valid_json(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read().strip()
            if not content or content == "{}":
                return None
            return json.loads(content)
    except Exception as e:
        logger.warning(f"⚠️ Lỗi đọc JSON {file_path}: {e}")
        return None

In [42]:
def index_json_files(es_client):
    for level in range(1, 13):
        dir_name = f"ocr_results_L{level:02}"
        full_path = os.path.join(OCR_RESULTS_ROOT, dir_name)
        if not os.path.isdir(full_path):
            logger.warning(f"Ignore non-existent directory: {full_path}")
            continue

        logger.info(f"Processing directory: {dir_name}")
        actions = []
        for file_name in os.listdir(full_path):
            if not file_name.endswith(".json"):
                continue
            file_path = os.path.join(full_path, file_name)
            data = load_valid_json(file_path)
            if not data:
                continue

            doc = {
                "_index": ES_INDEX,
                "_source": {
                    "image_filename": file_name.replace(".json", ""),
                    "image_path": file_path,
                    "processing_timestamp": datetime.utcnow(),
                    "ocr_results": data.get("ocr_results", []),
                    "extracted_text_full": data.get("extracted_text_full", ""),
                    "total_confidence": data.get("total_confidence", 0),
                    "processing_status": data.get("processing_status", "unknown"),
                    "error_message": data.get("error_message", None)
                }
            }
            actions.append(doc)

        if actions:
            helpers.bulk(es_client, actions)
            logger.info(f"Successfully indexed {len(actions)} documents from {dir_name}")
        else:
            logger.info(f"No valid files found in {dir_name}")


In [43]:
if __name__ == "__main__":
    es = create_es_client()
    create_ocr_index(es)
    index_json_files(es)
    logger.info("Done !")

INFO:elastic_transport.transport:HEAD http://127.0.0.1:9200/ [status:200 duration:0.009s]
INFO:__main__:Connected to Elasticsearch at 127.0.0.1:9200
INFO:elastic_transport.transport:HEAD http://127.0.0.1:9200/ocr_extractions [status:200 duration:0.003s]
INFO:__main__:Index ocr_extractions already exists. Skipping creation.
INFO:__main__:Processing directory: ocr_results_L01
  "processing_timestamp": datetime.utcnow(),


AttributeError: 'list' object has no attribute 'get'

In [69]:
!curl -X GET "http://localhost:9200/_cat/indices?v"


health status index           uuid                   pri rep docs.count docs.deleted store.size pri.store.size dataset.size
yellow open   ocr_extractions -6_xmu7sQveYmuosOSi01w   1   1          3            0      7.2kb          7.2kb        7.2kb


In [76]:
!curl -X DELETE http://localhost:9200/ocr_extractions


{"acknowledged":true}

In [80]:
from elasticsearch import Elasticsearch
import json

es = Elasticsearch("http://localhost:9200")

def search_ocr_text_flat(index_name, search_text, max_docs=100, max_hits_per_doc=100):
    search_text = search_text.lower()

    # nested query
    query = {
        "nested": {
            "path": "ocr_results",
            "query": {
                "match": {
                    "ocr_results.text": {
                        "query": search_text,
                        "operator": "and",
                        "fuzziness": "AUTO"
                    }
                }
            },
            "inner_hits": {
                "size": max_hits_per_doc
            }
        }
    }

    resp = es.search(index=index_name, query=query, size=max_docs)
    out = []

    for doc in resp["hits"]["hits"]:
        filename = doc["_source"].get("image_filename")   # L01_V003
        nested_hits = doc.get("inner_hits", {})\
                         .get("ocr_results", {})\
                         .get("hits", {})\
                         .get("hits", [])
        for nh in nested_hits:
            src = nh.get("_source", {})
            # lấy text/conf/bbox trực tiếp
            text       = src.get("text")
            confidence = src.get("confidence")
            bbox       = src.get("bbox")
            image      = src.get("image")  # tên ảnh như "0001.jpg"

            if text is None:
                continue

            out.append({
                "text":       text,
                "confidence": confidence,
                "bbox":       bbox,
                "image":      image,
                "filename":   filename
            })

    # sắp xếp theo độ tin cậy giảm dần
    return sorted(out, key=lambda x: - (x["confidence"] or 0))


if __name__ == "__main__":
    result = search_ocr_text_flat("ocr_extractions", "giay")
    print(json.dumps(result, indent=2, ensure_ascii=False))


INFO:elastic_transport.transport:POST http://localhost:9200/ocr_extractions/_search [status:200 duration:0.494s]


[
  {
    "text": "giay",
    "confidence": 0.999041736125946,
    "bbox": {
      "x1": 904,
      "y1": 215,
      "x2": 974,
      "y2": 263
    },
    "image": "0119.jpg",
    "filename": "L01_V003"
  },
  {
    "text": "giay",
    "confidence": 0.9989584684371948,
    "bbox": {
      "x1": 917,
      "y1": 232,
      "x2": 983,
      "y2": 272
    },
    "image": "0196.jpg",
    "filename": "L01_V002"
  },
  {
    "text": "giay",
    "confidence": 0.9989184737205505,
    "bbox": {
      "x1": 443,
      "y1": 264,
      "x2": 529,
      "y2": 320
    },
    "image": "0225.jpg",
    "filename": "L01_V001"
  },
  {
    "text": "giay",
    "confidence": 0.9988574981689453,
    "bbox": {
      "x1": 904,
      "y1": 215,
      "x2": 975,
      "y2": 263
    },
    "image": "0191.jpg",
    "filename": "L01_V003"
  },
  {
    "text": "giay",
    "confidence": 0.9988523721694946,
    "bbox": {
      "x1": 904,
      "y1": 215,
      "x2": 975,
      "y2": 263
    },
    "image": "0137.jp

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import json

es = Elasticsearch("http://localhost:9200")

def get_all_documents(index_name):
    results = []

    for doc in scan(es, index=index_name):
        results.append(doc["_source"])

    return results

if __name__ == "__main__":
    all_data = get_all_documents("ocr_extractions")
    print(json.dumps(all_data, indent=2, ensure_ascii=False))


INFO:elastic_transport.transport:POST http://localhost:9200/ocr_extractions/_search?scroll=5m [status:200 duration:0.029s]
INFO:elastic_transport.transport:POST http://localhost:9200/_search/scroll [status:200 duration:0.003s]
INFO:elastic_transport.transport:DELETE http://localhost:9200/_search/scroll [status:200 duration:0.002s]


[
  {
    "image_filename": "L01_V001",
    "image_path": "/Volumes/Transcend/AIC/AIO-AIClosers/PublicData/result/ocr_results_L01/L01_V001.json",
    "processing_timestamp": "2025-07-26T12:26:20.597954+00:00",
    "ocr_results": [
      {
        "image": "0001.jpg",
        "text": "giay",
        "confidence": 0.997688353061676,
        "bbox": {
          "x1": 688,
          "y1": 349,
          "x2": 871,
          "y2": 443
        }
      },
      {
        "image": "0002.jpg",
        "text": "giay",
        "confidence": 0.9979537725448608,
        "bbox": {
          "x1": 721,
          "y1": 512,
          "x2": 765,
          "y2": 537
        }
      },
      {
        "image": "0003.jpg",
        "text": "Oaay",
        "confidence": 0.5992012023925781,
        "bbox": {
          "x1": 405,
          "y1": 392,
          "x2": 449,
          "y2": 414
        }
      },
      {
        "image": "0003.jpg",
        "text": "PHAN NY",
        "confidence": 0.9323171377182

In [75]:
from elasticsearch import Elasticsearch
import json

es = Elasticsearch("http://localhost:9200")

def get_document_by_filename(index_name, filename):
    query = {
        "term": {
            "image_filename": filename
        }
    }

    response = es.search(index=index_name, query=query, size=1)

    results = []
    for hit in response["hits"]["hits"]:
        results.append(hit["_source"])

    return results

# Gọi thử
if __name__ == "__main__":
    result = get_document_by_filename("ocr_extractions", "L01_V003")
    print(json.dumps(result, indent=2, ensure_ascii=False))


INFO:elastic_transport.transport:POST http://localhost:9200/ocr_extractions/_search [status:200 duration:0.017s]


[
  {
    "image_filename": "L01_V003",
    "image_path": "/Volumes/Transcend/AIC/AIO-AIClosers/PublicData/result/ocr_results_L01/L01_V003.json",
    "processing_timestamp": "2025-07-26T12:24:08.895486+00:00",
    "ocr_results": [
      {
        "text": "giay",
        "confidence": 0.9976614713668823,
        "bbox": {
          "x1": 687,
          "y1": 348,
          "x2": 873,
          "y2": 444
        }
      },
      {
        "text": "giay",
        "confidence": 0.9806345701217651,
        "bbox": {
          "x1": 752,
          "y1": 520,
          "x2": 787,
          "y2": 536
        }
      },
      {
        "text": "ViNH PHU",
        "confidence": 0.881537675857544,
        "bbox": {
          "x1": 485,
          "y1": 390,
          "x2": 587,
          "y2": 417
        }
      },
      {
        "text": "MINH NGOC",
        "confidence": 0.9518093466758728,
        "bbox": {
          "x1": 737,
          "y1": 392,
          "x2": 864,
          "y2": 417
    