In [None]:
import importlib
import sys
import re
import os

from dotenv import load_dotenv


# Load settings from a .env file (python-dotenv) before anything reads the environment.
load_dotenv()

# Remove every cached `database*` module from sys.modules so that the imports
# below re-execute the package source instead of reusing the copy already
# loaded in this kernel (lets edits to the package take effect without a restart).
modules_to_remove = [key for key in sys.modules.keys() if key.startswith('database')]
for module in modules_to_remove:
    del sys.modules[module]

# Import the package again now that the cached copies are gone.
import database

from database.repository import AnncLhRepository, AnncQrRepository, AnncAllRepository, AnncFileRepository
# Repository instances used by the cells below.
lh_repo = AnncLhRepository()
qr_repo = AnncQrRepository()
all_repo = AnncAllRepository()
file_repo = AnncFileRepository()

In [None]:
# Identifier of the batch processed by this notebook run (hard-coded).
batch_id = '6c71a04b-01e7-4b3f-a3e9-b5b9b1d85cf1'

In [None]:
# Alternative query kept for reference (reads through lh_repo instead of qr_repo):
# annc_list_lh = lh_repo.get_announcements(batch_id, batch_status='PENDING', annc_type=('분양','임대'))

# Fetch the merge-target rows of this batch whose annc_status is '공고중'.
# NOTE(review): the result is stored under an `_lh` name although it is read via qr_repo — confirm intended.
annc_list_lh = qr_repo.get_announcements_merge_target(batch_id, annc_status='공고중')

# Report how many rows the query returned.
print(f'총 {len(annc_list_lh)}건 조회됨')

In [None]:
# Display the fetched rows for manual inspection.
annc_list_lh

In [None]:

# Test run: keep only the first row.
# This rebinds annc_list_lh, so the query cell must be re-run to get the full list back.
annc_list_lh = annc_list_lh[0:1]

In [None]:
import requests
import json

def get_file_list(row):
    """Fetch the announcement-PDF attachment entries for one LH announcement.

    Posts the announcement's identifying codes to the LH attachment-list
    endpoint and keeps only the entries whose ``slPanAhflDsCdNm`` is
    '공고문(PDF)' or '정정공고문(PDF)'.

    Args:
        row: Mapping that provides ``lh_upp_ais_tp_cd``, ``lh_ais_tp_cd``,
            ``lh_ccr_cnnt_sys_ds_cd``, ``lh_ls_sst`` and ``lh_pan_id``.

    Returns:
        A non-empty list of the matching entries, exactly as decoded from
        the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status
            (raised by ``raise_for_status``).
        Exception: If no entry of an accepted type is present.
    """
    FILE_CALL_URL = "https://apply.lh.or.kr/lhapply/wt/wrtanc/wrtFileDownl.do"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Referer": "https://apply.lh.or.kr/"
    }
    # Attachment type names that count as an announcement PDF.
    ACCEPTED_TYPES = ('공고문(PDF)', '정정공고문(PDF)')

    # Form fields the attachment-list endpoint expects.
    form_data_file = {
        'uppAisTpCd1': row['lh_upp_ais_tp_cd'],
        'aisTpCd1': row['lh_ais_tp_cd'],
        'ccrCnntSysDsCd1': row['lh_ccr_cnnt_sys_ds_cd'],
        'lsSst1': row['lh_ls_sst'],
        'panId1': row['lh_pan_id']
    }

    response = requests.post(FILE_CALL_URL, data=form_data_file, headers=HEADERS, timeout=15)
    response.raise_for_status()
    # Force UTF-8 so the Korean type names decode correctly regardless of
    # what charset the server declares.
    response.encoding = 'utf-8'

    entries = json.loads(response.text)
    if not isinstance(entries, list):
        # Anything other than a JSON array carries no attachment entries.
        entries = []

    # Use .get() and skip non-dict items so that a single entry without a
    # type name cannot abort the lookup with a KeyError/TypeError.
    file_list = [
        obj for obj in entries
        if isinstance(obj, dict) and obj.get('slPanAhflDsCdNm') in ACCEPTED_TYPES
    ]

    if not file_list:
        raise Exception("파일 없음!")

    return file_list


In [None]:
import pymupdf
from pymupdf4llm import to_markdown

In [None]:
def clear_text(markdown_text):
    """Flatten Markdown text into a single, whitespace-normalised line.

    HTML line-break tags are replaced by a space, every run of whitespace
    (newlines and tabs included) is collapsed to one space, and leading and
    trailing whitespace is removed.

    Args:
        markdown_text: The Markdown string to clean.

    Returns:
        The cleaned single-line string.
    """
    # Cover <br>, <br/> and <br /> in any letter case, not just the literal '<br>'.
    without_breaks = re.sub(r'<br\s*/?>', ' ', markdown_text, flags=re.IGNORECASE)

    # '\s+' already matches '\n', so newlines need no separate replacement.
    return re.sub(r'\s+', ' ', without_breaks).strip()

In [None]:
# Number of failed announcements at which the run is aborted.
MAX_ERRORS = 5
error_cnt = 0


for row_lh in annc_list_lh:
    try:

        # 1. Mark the staging-table row as started.
        lh_repo.update_announcements('START', row_lh['batch_id'], row_lh['batch_seq'])

        # 2. Prepare the row and merge it into the announcement table.
        row_lh['corp_cd'] = 'LH'
        row_lh['service_status'] = 'CLOSE'
        merge_result = all_repo.merge_announcements([row_lh,])  # API takes a list (built for multi-row use)

        if not merge_result:
            raise Exception("머지된 행 없음")
        # Only one row was sent, so only the first result is used.
        merge_result = merge_result[0]
        row_lh['annc_id'] = merge_result['annc_id']

        # 3. Look up the announcement's PDF attachments.
        file_list = get_file_list(row_lh)

        if not file_list:
            raise Exception("파일 없음")

        # 4. Download and register each attachment.
        for file_info in file_list:

            annc_file = {}

            annc_file['annc_id'] = row_lh['annc_id']
            annc_file['file_name'] = file_info['cmnAhflNm']
            annc_file['file_type'] = file_info['slPanAhflDsCdNm']
            annc_file['file_ext'] = 'pdf'

            # 4-1 Download the file.
            # Outer double quotes: reusing the same quote character inside an
            # f-string is a SyntaxError before Python 3.12.
            download_url = f"https://apply.lh.or.kr/lhapply/lhFile.do?fileid={file_info['cmnAhflSn']}"

            # TLS verification stays enabled (the attachment-list request to the
            # same host already runs with it), and the body is read in full, so
            # neither verify=False nor stream=True is needed.
            file_response = requests.get(download_url, timeout=30)
            file_response.raise_for_status()

            file_content = file_response.content

            if not file_content:
                raise Exception("파일 내용 없음")

            # The file name comes from the server response: strip any directory
            # part so the file can only be written into the working directory.
            safe_name = os.path.basename(annc_file['file_name'].replace('\\', '/'))
            with open("./" + safe_name, mode='wb') as f:
                f.write(file_content)

            # Disabled experiment: convert the downloaded PDF to Markdown in memory.
            # pdf_document = pymupdf.open(stream=file_content, filetype="pdf")
            # markdown_text = to_markdown(pdf_document)
            # pdf_document.close()

            # Content-Length is optional (e.g. chunked responses); fall back to
            # the actual byte count, kept as a string like the header value.
            annc_file['file_size'] = (
                file_response.headers.get('Content-Length') or str(len(file_content))
            )

            # 4-2 Register the file row.
            file_repo.bulk_insert_files([annc_file])

    except Exception as e:
        # Tolerate a few failing announcements, then abort the run.
        # NOTE(review): a failed row keeps the 'START' status set in step 1 —
        # confirm whether a failure status should be written here.
        error_cnt += 1
        print(e)
        if error_cnt < MAX_ERRORS:
            continue
        raise





In [None]:
from llama_cloud_services import LlamaParse

# Read the LlamaParse API key from the environment (None if the variable is unset).
lp_api_key = os.getenv("LLAMA_CLOUD_API_KEY")

parser = LlamaParse(
    api_key=lp_api_key,  # can also be set in your env as LLAMA_CLOUD_API_KEY
    num_workers=4,       # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="ko",       # optionally define a language, default=en
)

# Parse one specific PDF from the working directory; the file name is hard-coded,
# so the file must already exist there before this cell runs.
result = parser.parse("./(정정)양주회천A25BL영구임대주택최초입주자모집공고문.pdf")

In [None]:
# Convert the parse result into Markdown documents, requesting a split by page.
markdown_documents = result.get_markdown_documents(split_by_page=True)

In [None]:
# Inspect the metadata of the first document.
markdown_documents[0].metadata

In [None]:
# Inspect the Markdown text of the first document.
markdown_documents[0].text_resource.text

In [None]:
# Print the character count of each document's text.
for md in markdown_documents:
    print(len(md.text_resource.text))

In [None]:
# Display the full list of documents for manual inspection.
markdown_documents

In [None]:
# NOTE(review): presumably the embedding model planned for the next step — confirm: text-embedding-3-small

