In [3]:
import re
from underthesea import sent_tokenize, word_tokenize
import json

#for module import
import sys, os
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from shared_functions.global_functions import *

In [4]:
# Batch-convert every entry of the original_doc folder to PDF via doc_to_pdf.
# doc_to_pdf is not defined in this notebook; presumably it comes from the
# star-import of shared_functions.global_functions — verify.

folder_path = "D:/Study/Education/Projects/Group_Project/source/document/original_doc"
files = os.listdir(folder_path)

# NOTE(review): os.listdir returns every directory entry (any extension, and
# sub-directories too); nothing here filters to Word documents — confirm that
# doc_to_pdf tolerates non-document inputs.
for f in files:
    # The second argument looks like the output directory (it is the same path
    # that PDF_PATH holds later in the notebook) — TODO confirm.
    doc_to_pdf(f'{folder_path}/{f}', f'D:/Study/Education/Projects/Group_Project/source/document')

In [8]:
from underthesea import word_tokenize

In [None]:
# Absolute local paths used by the cache helpers and the batch pipeline below.
# JSON file holding the list of PDF filenames that were already processed.
CACHE_PATH = "D:/Study/Education/Projects/Group_Project/source/cache_processed.json"
# Directory that receives one "<filename>.txt" per processed document.
ANNOTATE_DIR = "D:/Study/Education/Projects/Group_Project/source/data/annotate_text"
# Directory scanned for local ".pdf" files to upload and process.
PDF_PATH = "D:/Study/Education/Projects/Group_Project/source/document"

# --- Cache Management ---
def load_cache():
    """Load the set of already-processed filenames from CACHE_PATH.

    Returns:
        set: the entries stored in the cache file, or an empty set when the
        file does not exist or cannot be interpreted as a JSON list.

    A cache file that is empty, truncated, or otherwise not a JSON list is
    treated as "nothing processed yet" (with a printed warning) instead of
    aborting the batch; the worst case is that files get processed again.
    """
    if not os.path.exists(CACHE_PATH):
        return set()

    try:
        with open(CACHE_PATH, "r", encoding="utf-8") as f:
            entries = json.load(f)
    except json.JSONDecodeError as e:
        # Typically an interrupted write left a partial file behind.
        print(f"Warning: cache file {CACHE_PATH} is not valid JSON ({e}); starting with an empty cache.")
        return set()

    if not isinstance(entries, list):
        # save_cache always writes a list; anything else is not a cache we wrote.
        print(f"Warning: cache file {CACHE_PATH} does not contain a list; starting with an empty cache.")
        return set()

    return set(entries)

def save_cache(cache):
    """Persist the collection of processed filenames to CACHE_PATH as a JSON list.

    Args:
        cache: iterable (normally a set) of processed filenames.

    The data is first written to a sibling temporary file and then moved over
    CACHE_PATH with os.replace, so an interruption in the middle of the write
    cannot leave a half-written cache file behind. Entries are written in
    sorted order so the file content does not depend on set iteration order.
    """
    tmp_path = f"{CACHE_PATH}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        # key=str keeps sorting well-defined even if a non-string entry slipped in.
        json.dump(sorted(cache, key=str), f, ensure_ascii=False, indent=2)
    # Swap the finished file into place in one step.
    os.replace(tmp_path, CACHE_PATH)


# --- Main Batch Process ---
def batch_process_files():
    """Upload local PDFs to S3, extract their text, and store the first sentence.

    For every ".pdf" file found in PDF_PATH:
      1. upload it with upload_file_to_s3 when its name is not among the names
         returned by list_files_recursive (a failed upload skips the file);
      2. skip it when its name is already in the processed-files cache;
      3. otherwise fetch its text with get_text_from_s3, split it with
         sent_tokenize, and write the first sentence to
         ANNOTATE_DIR/<filename>.txt, then record the filename in the cache.

    The cache is saved every 10 newly processed files and always once more
    when the loop ends, including when it is cut short by an interrupt or any
    other exception that escapes the per-file handlers, so finished work is
    not redone on the next run.

    Returns:
        None. Progress is reported through print.
    """
    cache = load_cache()
    processed = 0

    # Local PDF names and the names already present in S3.
    local_pdfs = [f for f in os.listdir(PDF_PATH) if f.lower().endswith(".pdf")]
    s3_files = set(list_files_recursive())  # already uploaded files

    print(f"üóÇ Found {len(local_pdfs)} local PDFs.")
    print(f"‚òÅÔ∏è Found {len(s3_files)} files in S3.")

    try:
        for filename in local_pdfs:
            # Step 1: upload if not in S3 yet.
            if filename not in s3_files:
                try:
                    local_path = f'{PDF_PATH}/{filename}'
                    upload_file_to_s3(local_path)
                    print(f"‚úÖ Uploaded: {filename} to S3")
                except Exception as e:
                    print(f"‚ö†Ô∏è Failed to upload {filename}: {e}")
                    continue  # Nothing to pull from S3 for this file.

            # Step 2: skip if already processed (cached).
            if filename in cache:
                print(f"‚úÖ Skipping (cached): {filename}")
                continue

            # Step 3: pull text from S3 and process it.
            try:
                doc = get_text_from_s3(f"legaldocstorage/{filename}")
                tokenize_sent = sent_tokenize(doc)

                if not tokenize_sent:
                    print(f"‚ö†Ô∏è Empty text extracted for {filename}")
                    continue

                # Save the first sentence only.
                save_path = os.path.join(ANNOTATE_DIR, f"{filename}.txt")
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
                with open(save_path, "w", encoding="utf-8") as f:
                    f.write(tokenize_sent[0] + "\n")

                # Mark as processed.
                cache.add(filename)
                processed += 1

                # Periodic checkpoint so a hard crash loses little progress.
                if processed % 10 == 0:
                    save_cache(cache)

                print(f"‚úÖ Processed: {filename}")

            except Exception as e:
                # One bad document must not stop the rest of the batch.
                print(f"‚ö†Ô∏è Error processing {filename}: {e}")
    finally:
        # Always persist the cache, even when the loop is interrupted
        # (e.g. KeyboardInterrupt is not caught by the handlers above).
        save_cache(cache)

    print(f"\nüéâ Done! {processed} new files processed, total cached: {len(cache)}")

# Run the whole upload/extract pipeline when this cell is executed.
batch_process_files()

üóÇ Found 22 local PDFs.
‚òÅÔ∏è Found 22 files in S3.
‚úÖ Skipping (cached): khai_thue_tncn.pdf
‚úÖ Skipping (cached): luat_doanh_nghiep_2020.pdf
‚úÖ Skipping (cached): luat_doanh_nghiep_2025.pdf
‚úÖ Skipping (cached): luat_thue_gtgt_2024.pdf
‚úÖ Skipping (cached): luat_thue_gtgt_ttdb_quanly_thue_suadoi_2016.pdf
‚úÖ Skipping (cached): luat_thue_tndn_2025.pdf
‚úÖ Skipping (cached): luat_thue_ttdb_2025.pdf
‚úÖ Skipping (cached): luat_thue_xnk_2016.pdf
‚úÖ Uploaded D:/Study/Education/Projects/Group_Project/source/document/nghi_dinh_duan_ap_dung_cac_loai_hopdong.pdf to s3://legaldocstorage/nghi_dinh_duan_ap_dung_cac_loai_hopdong.pdf
‚úÖ Uploaded: nghi_dinh_duan_ap_dung_cac_loai_hopdong.pdf to S3
‚úÖ Processed: nghi_dinh_duan_ap_dung_cac_loai_hopdong.pdf
‚úÖ Skipping (cached): nghi_dinh_giahan_thoi_han_nop_thue_gtgt_2025.pdf
‚úÖ Skipping (cached): nghi_dinh_huongdan_VAT_2025.pdf
‚úÖ Uploaded D:/Study/Education/Projects/Group_Project/source/document/nghi_dinh_quydinh_quan_ly_thanhtoan.pdf t