# Module

In [2]:
import warnings
warnings.filterwarnings(action='ignore')
import os
import sys
import dotenv
import nest_asyncio
dotenv.load_dotenv()
import requests
import json
from glob import glob
import time
import pandas as pd
from tqdm import tqdm
from urllib.parse import unquote

# Setting

In [31]:
# API key is read from the environment (populated by dotenv.load_dotenv() above).
# os.getenv returns None when UPSTAGE_API_KEY is not set.
api_key = os.getenv("UPSTAGE_API_KEY")
# Every PDF directly inside the nursing data folder.
file_name_list = glob("../data/nursing/*.pdf")
# Authorization header sent with every request below.
headers = {"Authorization": f"Bearer {api_key}"}
# Endpoint that accepts a document for asynchronous parsing.
document_parser_url = "https://api.upstage.ai/v1/document-ai/async/document-parse"
# Endpoint used to list requests and, with "/<request_id>" appended, to poll one.
check_url = "https://api.upstage.ai/v1/document-ai/requests"


# Parsing

In [None]:
# Submit each PDF to the asynchronous document-parse endpoint, poll until every
# batch of the request is completed, and collect one row per batch with the
# columns document_file_name, request_id, batch. The rows are saved as CSV below.
df = []
for file_name in tqdm(file_name_list):
    try:
        print(f"{file_name} - Start!")
        data = {
            "ocr": "force",
            "output_formats": '["markdown", "html", "text"]'
            }
        # Open the PDF in a context manager so the handle is closed once the
        # upload has been sent, instead of leaking one open handle per file.
        with open(file_name, "rb") as document:
            files = {"document": document}
            api_response = requests.post(document_parser_url, headers=headers, files=files, data=data)
        # Surface HTTP errors with their real message instead of a bare
        # KeyError on "request_id" further down.
        api_response.raise_for_status()
        # Parse the submission response once; the id is reused for polling and for the rows.
        request_id = api_response.json()["request_id"]
        # URLs always use "/"; os.path.join would insert "\" on Windows.
        request_url = f"{check_url}/{request_id}"
        while True:
            check_response = requests.get(request_url, headers=headers)
            check_result = check_response.json()
            batches = check_result["batches"]
            # NOTE(review): "failed" is assumed to be the terminal error status
            # reported by the API for a request or a batch - confirm against the
            # Upstage documentation. Without this check a failed request would
            # be polled forever.
            if check_result.get("status") == "failed" or any(
                batch["status"] == "failed" for batch in batches
            ):
                raise RuntimeError(f"parse request {request_id} failed")
            # An empty batch list counts as "not ready yet", as it did before.
            if batches and all(batch["status"] == "completed" for batch in batches):
                print(f"{file_name} - Completed!")
                break
            # Wait before polling again.
            time.sleep(10)
        for batch in batches:
            df.append([file_name, request_id, batch])
        print("==============================================")
    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"{file_name} - error")
        print(e)
        continue

# Save the collected rows as CSV.
df = pd.DataFrame(df, columns=["document_file_name", "request_id", "batch"])
df.to_csv("../data/nursing/parse_result_3.csv", index=False)

# Check Result

##### Check request_id

In [8]:
# Query the requests endpoint with the auth headers and keep the "requests"
# entry of the JSON body; the download cell below iterates over it.
request_list_response = requests.get(check_url, headers=headers)
check_response = request_list_response.json()["requests"]

##### Downloading 함수 정의

In [25]:
def download_file(url, folder_path):
    """Download *url* into *folder_path* and print where the file was saved.

    The file name is the last segment of the URL's path component,
    percent-decoded so that non-ASCII (e.g. Korean) names are restored.
    The query string is ignored.

    Args:
        url: Address of the file to download.
        folder_path: Directory to save into; created if it does not exist.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If no file name can be derived from the URL path.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # only brings in `unquote`.
    from urllib.parse import urlsplit

    # Take the name from the path component only, so a "/" or "?" inside the
    # query string cannot end up in the file name. Decode after splitting so
    # an encoded "?" (%3F) in the name does not truncate it.
    file_name = unquote(urlsplit(url).path.rsplit("/", 1)[-1])
    # A decoded "%2F" could reintroduce directory parts; keep the final name only.
    file_name = os.path.basename(file_name)
    if not file_name:
        raise ValueError(f"Cannot derive a file name from URL: {url}")

    # Create the folder if needed; exist_ok avoids the check-then-create race.
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, file_name)

    # Stream the body in chunks; the context manager releases the connection
    # even when the download fails part-way.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()  # fail on HTTP error statuses
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    print(f"File saved at {file_path}")


##### Downloading

In [None]:
# Download every batch result of each completed request into folder_path.
folder_path = "./downloads"

for check_response_i in check_response:
    if check_response_i["status"] == "completed":
        # URLs always use "/"; os.path.join would insert "\" on Windows.
        request_url = f"{check_url}/{check_response_i['id']}"
        batch_results = requests.get(request_url, headers=headers).json()["batches"]
        for batch_result in batch_results:
            download_file(batch_result["download_url"], folder_path)
        print("Completed")
    else:
        print("Not completed yet")