In [1]:
import os
import sys

PROJECT_PATH = os.path.dirname(os.getcwd())
sys.path.append(PROJECT_PATH)

In [2]:
print(PROJECT_PATH)

/home/a543979/hdegis-data-processor


In [3]:
import os
import sys
import json
import time
from typing import List, Tuple
from db.initialize import initialize_tables
from db.models import PageStatus
from db.session import get_db_session
from db.repository import Repository
from storage.gcs_client import GCSStorageClient
from processor.pdf_manager import PDFManager
from processor.elastic import ESConnector
from utils.logger import get_logger
from utils.utils import compute_doc_hash
from config import (
    GCS_SOURCE_BUCKET,
    GCS_PROCESSED_BUCKET,
    PROJECT_ID, 
    GENAI_LOCATION,
    ES_HOST,
    ES_USER,
    ES_PWD,
    INDEX_NAME,
    LOG_LEVEL
)
from google.cloud import storage
from google import genai

In [4]:
db_gen = get_db_session()
session = next(db_gen)

In [19]:
gcs_client = storage.Client()
storage_client = GCSStorageClient(GCS_SOURCE_BUCKET, GCS_PROCESSED_BUCKET, gcs_client)
repo = Repository(session)

In [20]:
known_doc_ids = repo.list_all_document_hashes()

In [21]:
len(known_doc_ids), type(known_doc_ids)

(66, set)

In [22]:
new_docs: List[Tuple[str, str]] = []

In [23]:
pdf_paths = storage_client.list_pdfs()

In [25]:
len(pdf_paths), type(pdf_paths)

(99, list)

In [26]:
"2. Type Test Reports/300SR/245 kV 63 kA MS (2024)/KEMA 3078-22 R0_Mechanical_IP_Pressure.pdf" in pdf_paths

True

In [27]:
"2. Type Test Reports/300SR/245 kV 63 kA MS (2024)/KERI 20TC100154_Insulator_Partition.PDF" in pdf_paths

True

In [28]:
from utils.utils import compute_doc_hash

In [29]:
for path in [
    "2. Type Test Reports/300SR/245 kV 63 kA MS (2024)/KEMA 3078-22 R0_Mechanical_IP_Pressure.pdf",
    "2. Type Test Reports/300SR/245 kV 63 kA MS (2024)/KERI 20TC100154_Insulator_Partition.PDF"
]:
    h = compute_doc_hash(storage_client, path)
    print(path, h)


2. Type Test Reports/300SR/245 kV 63 kA MS (2024)/KEMA 3078-22 R0_Mechanical_IP_Pressure.pdf 9319523986eaa6a4e0cbe86119377bbdc654000477acc2c83b5674a2599b4159
2. Type Test Reports/300SR/245 kV 63 kA MS (2024)/KERI 20TC100154_Insulator_Partition.PDF 03eda2a04f1d0774c6bcd07f5063b5df61b2e7472ca1d23a882d8005342a63e2


In [35]:
h1 = compute_doc_hash(storage_client,"2. Type Test Reports/300SR/245 kV 63 kA MS (2024)/KEMA 3078-22 R0_Mechanical_IP_Pressure.pdf")

In [36]:
h2 = compute_doc_hash(storage_client,"2. Type Test Reports/300SR/245 kV 63 kA MS (2024)/KERI 20TC100154_Insulator_Partition.PDF")

In [37]:
h1 == h2

False

In [38]:
h1 == "9319523986eaa6a4e0cbe86119377bbdc654000477acc2c83b5674a2599b4159"

True

In [5]:
db_gen = get_db_session()
session = next(db_gen)

In [6]:
gcs_client = storage.Client()
storage_client = GCSStorageClient(GCS_SOURCE_BUCKET, GCS_PROCESSED_BUCKET, gcs_client)
repo = Repository(session)
genai_client = genai.Client(vertexai=True, project=PROJECT_ID, location=GENAI_LOCATION)
els = ESConnector(hosts=ES_HOST, credentials=(ES_USER, ES_PWD))

manager = PDFManager(storage_client, repo, genai_client, els)

  _transport = transport_class(


In [7]:
pdf_paths = storage_client.list_pdfs()

In [8]:
print("Found %d PDF files in GCS", len(pdf_paths))

Found %d PDF files in GCS 99


In [9]:
known_doc_ids = repo.list_all_document_hashes()

In [10]:
new_docs: List[Tuple[str, str]] = []

In [11]:
for path in pdf_paths:
    try:
        doc_hash = compute_doc_hash(storage_client, path)
        if doc_hash not in known_doc_ids:
            new_docs.append((doc_hash, path))
    except Exception as e:
        print(" └── Hash computation failed for %s (%s)", path, e)

print(" └── Detected %d new documents", len(new_docs))

 └── Detected %d new documents 99


In [12]:
new_docs[:3]

[('32fd02100c080b28525b774e77809891589f04dbf183ea3f12f8b5adc4b20c28',
  '1. International Standards/IEC/IEC 60099-4_Edition 3.0_2014-06.pdf'),
 ('3d479578ab04b3cab400a22dd91bd4829d3875e5e0fe22552dc51f91b739afc8',
  '1. International Standards/IEC/IEC 60137_Edition 7.0_2017-06.pdf'),
 ('9f2aed2a7f66f3a902ae2d1eb91f99ecd06171343e351eb916782749b1ed9caf',
  '1. International Standards/IEC/IEC 60376_Edition 3.0_2018-05.pdf')]

In [13]:
new_docs_hash = [h for h, p in new_docs]

In [14]:
len(new_docs_hash), len(set(new_docs_hash))

(99, 99)

In [None]:
set()

In [15]:
from collections import defaultdict

hash_to_paths = defaultdict(list)

for path in pdf_paths:
    try:
        doc_hash = compute_doc_hash(storage_client, path)
        hash_to_paths[doc_hash].append(path)
        if doc_hash not in known_doc_ids:
            new_docs.append((doc_hash, path))
    except Exception as e:
        print(" └── Hash computation failed for %s (%s)" % (path, e))

# 중복된 hash들 출력
print(" └── Detected %d new documents" % len(new_docs))

duplicates = {h: p_list for h, p_list in hash_to_paths.items() if len(p_list) > 1}
if duplicates:
    print(" └── Found duplicate hashes:")
    for h, paths in duplicates.items():
        print(f"    - Hash {h} is shared by:")
        for p in paths:
            print(f"      • {p}")
else:
    print(" └── No duplicate doc_hash values found.")


 └── Detected 198 new documents
 └── No duplicate doc_hash values found.
